topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,67 @@
1
+ """ChatGPT parser for ingestion layer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from dataclasses import dataclass
7
+ from typing import Any, Dict
8
+
9
+ from ..log_preview import field_preview
10
+ from ..sources.base import RawRecord
11
+ from ..validation.base import ValidationResult
12
+ from ..validation.schema_registry import validate_schema
13
+ from .base import NormalizedRecord, Parser
14
+
15
+ logger = logging.getLogger("topos.ingestion.parser.chatgpt")
16
+
17
+
18
@dataclass
class ChatGPTParser(Parser):
    """Parser that maps raw ChatGPT message records to the normalized chat shape.

    Attributes:
        dataset_id: Identifier copied into every normalized record.
        _schema_id: Schema identifier passed to ``validate_schema``.
    """

    dataset_id: str
    _schema_id: str = "chatgpt.conversation.v1"  # Default to v1, can be overridden

    @staticmethod
    def _to_iso_timestamp(created_at: Any) -> str:
        """Return ``created_at`` as a timestamp string, or ``""`` if unusable.

        Numeric values are treated as epoch seconds and rendered as an
        ISO-8601 UTC string; strings are passed through unchanged.
        """
        if isinstance(created_at, str):
            return created_at
        if isinstance(created_at, (int, float)):
            from datetime import datetime, timezone

            try:
                return datetime.fromtimestamp(created_at, tz=timezone.utc).isoformat()
            except (OverflowError, OSError, ValueError):
                # Keep ingestion resilient if a record carries an out-of-range
                # timestamp (same policy as the messenger parsers).
                return ""
        return ""

    def parse(self, raw: RawRecord) -> NormalizedRecord:
        """Convert one raw record into a ``NormalizedRecord``.

        The role ``"user"`` (case-insensitive) maps to sender type
        ``"human"``; every other role, including a missing one, maps to
        ``"assistant"``.
        """
        payload = raw.payload
        # ``or ""`` also covers an explicit ``"role": None``, which a plain
        # ``.get("role", "")`` default would let through and crash on ``.lower()``.
        role = str(payload.get("role") or "").lower()
        sender_type = "human" if role == "user" else "assistant"
        normalized = {
            "message_id": payload.get("id", raw.record_id),
            "dataset_id": self.dataset_id,
            "thread_id": payload.get("thread_id", ""),
            "ts": self._to_iso_timestamp(payload.get("created_at")),
            "sender_type": sender_type,
            "content": payload.get("content", ""),
        }
        # Preserve _metadata if present (for conversation tree reconstruction)
        if "_metadata" in payload:
            normalized["_metadata"] = payload["_metadata"]
        logger.debug(
            "[PIPELINE:PARSER] Parsed record: message_id=%s, sender_type=%s, thread_id=%s, content_preview=%s",
            normalized["message_id"],
            normalized["sender_type"],
            normalized["thread_id"],
            field_preview(normalized.get("content")),
        )
        return NormalizedRecord(record_id=normalized["message_id"], payload=normalized)

    def validate(self, record: RawRecord) -> ValidationResult:
        """Validate the raw payload against this parser's schema id."""
        is_valid, error = validate_schema(record.payload, self.schema_id())
        errors = [] if is_valid else [error or "Invalid record"]
        logger.debug(
            "[PIPELINE:PARSER] Validation result: record_id=%s, is_valid=%s, errors=%s",
            record.record_id,
            is_valid,
            errors,
        )
        return ValidationResult(is_valid=is_valid, errors=errors, metadata={})

    def schema_id(self) -> str:
        """Return the schema identifier used for validation."""
        return self._schema_id
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ from ..sources.base import RawRecord
6
+ from ..validation.base import ValidationResult
7
+ from .base import NormalizedRecord, Parser
8
+
9
+
10
+ @dataclass
11
+ class GrokParser(Parser):
12
+ dataset_id: str
13
+
14
+ def parse(self, raw: RawRecord) -> NormalizedRecord:
15
+ return NormalizedRecord(record_id=raw.record_id, payload=raw.payload)
16
+
17
+ def validate(self, record: RawRecord) -> ValidationResult:
18
+ return ValidationResult(is_valid=True, errors=[], metadata={})
19
+
20
+ def schema_id(self) -> str:
21
+ return "grok.conversation.v1"
@@ -0,0 +1,97 @@
1
+ """Minimal parsers for messenger ingestion (iMessage, Signal).
2
+
3
+ Maps raw dict/row to normalized chat shape (message_id, thread_id, sender_type, content, ts).
4
+ Full implementation (reading from chat.db / Signal DB) is in Sprints 03 and 04.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from dataclasses import dataclass
11
+ from typing import Any, Dict
12
+
13
+ from ..sources.base import RawRecord
14
+ from ..validation.base import ValidationResult
15
+ from ..validation.schema_registry import validate_schema
16
+ from .base import NormalizedRecord, Parser
17
+
18
+ logger = logging.getLogger("topos.ingestion.parser.messenger")
19
+
20
+
21
+ def _normalize_messenger_payload(payload: Dict[str, Any], record_id: str, dataset_id: str) -> Dict[str, Any]:
22
+ """Convert raw messenger record to normalized shape for conversation_messages."""
23
+ role = (payload.get("role") or payload.get("sender_type") or "user").lower()
24
+ sender_type = "human" # Preserve legacy semantics; identity is carried in sender_id.
25
+ created_at = payload.get("created_at") or payload.get("ts")
26
+ ts = ""
27
+ if isinstance(created_at, (int, float)):
28
+ from datetime import datetime, timezone
29
+ try:
30
+ ts = datetime.fromtimestamp(created_at, tz=timezone.utc).isoformat()
31
+ except (OverflowError, OSError, ValueError):
32
+ # Keep ingestion resilient if a source record has an out-of-range timestamp.
33
+ ts = ""
34
+ elif isinstance(created_at, str):
35
+ ts = created_at
36
+ normalized = {
37
+ "message_id": str(payload.get("id") or payload.get("message_id") or record_id),
38
+ "dataset_id": dataset_id,
39
+ "thread_id": str(payload.get("thread_id") or payload.get("conversation_id") or ""),
40
+ "conversation_id": str(payload.get("thread_id") or payload.get("conversation_id") or ""),
41
+ "ts": ts,
42
+ "sender_type": sender_type,
43
+ "content": (payload.get("content") or "") or "",
44
+ }
45
+ if "_metadata" in payload:
46
+ normalized["_metadata"] = payload["_metadata"]
47
+ if payload.get("sender_id") is not None:
48
+ normalized["sender_id"] = str(payload["sender_id"])
49
+ if payload.get("reply_to_message_id") is not None:
50
+ normalized["reply_to_message_id"] = str(payload["reply_to_message_id"])
51
+ if payload.get("message_type") is not None:
52
+ normalized["message_type"] = str(payload["message_type"])
53
+ if payload.get("event_type") is not None:
54
+ normalized["event_type"] = str(payload["event_type"])
55
+ return normalized
56
+
57
+
58
@dataclass
class ImessageParser(Parser):
    """Parser for iMessage normalized records (imessage.messages.v1)."""

    dataset_id: str
    _schema_id: str = "imessage.messages.v1"

    def schema_id(self) -> str:
        """Return the schema identifier this parser validates against."""
        return self._schema_id

    def validate(self, record: RawRecord) -> ValidationResult:
        """Check the raw payload against the schema and report any error."""
        ok, problem = validate_schema(record.payload, self._schema_id)
        if ok:
            found = []
        else:
            # Fall back to a generic message when the validator gives none.
            found = [problem or "Invalid record"]
        return ValidationResult(is_valid=ok, errors=found, metadata={})

    def parse(self, raw: RawRecord) -> NormalizedRecord:
        """Normalize the raw payload via the shared messenger mapping."""
        shaped = _normalize_messenger_payload(raw.payload, raw.record_id, self.dataset_id)
        return NormalizedRecord(record_id=shaped["message_id"], payload=shaped)
77
+
78
+
79
@dataclass
class SignalParser(Parser):
    """Parser for Signal normalized records (signal.messages.v1)."""

    dataset_id: str
    _schema_id: str = "signal.messages.v1"

    def schema_id(self) -> str:
        """Return the schema identifier this parser validates against."""
        return self._schema_id

    def validate(self, record: RawRecord) -> ValidationResult:
        """Check the raw payload against the schema and report any error."""
        ok, problem = validate_schema(record.payload, self._schema_id)
        if ok:
            found = []
        else:
            # Fall back to a generic message when the validator gives none.
            found = [problem or "Invalid record"]
        return ValidationResult(is_valid=ok, errors=found, metadata={})

    def parse(self, raw: RawRecord) -> NormalizedRecord:
        """Normalize the raw payload via the shared messenger mapping."""
        shaped = _normalize_messenger_payload(raw.payload, raw.record_id, self.dataset_id)
        return NormalizedRecord(record_id=shaped["message_id"], payload=shaped)
@@ -0,0 +1,54 @@
1
+ """Progress tracking for ingestion jobs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ from typing import Optional
7
+
8
+
9
class IngestionProgress:
    """Tracks processed-record counts and derives percent/ETA for one job."""

    def __init__(self, job_id: str, records_total: Optional[int] = None):
        """Start tracking ``job_id``; ``records_total`` may be unknown (None)."""
        self.job_id = job_id
        self.records_total = records_total
        self.records_processed = 0
        self.start_time = time.time()
        self.last_update_time = self.start_time
        self.current_step = "parsing"
        self.errors_count = 0

    def update(self, records_processed: int, current_step: Optional[str] = None) -> None:
        """Record the absolute processed count and, if given, the new step."""
        self.records_processed = records_processed
        self.last_update_time = time.time()
        if current_step:
            self.current_step = current_step

    def get_progress_percent(self) -> float:
        """Return completion in percent, capped at 100; 0.0 if total is unknown."""
        if not self.records_total:
            return 0.0
        return min(100.0, (self.records_processed / self.records_total) * 100.0)

    def get_estimated_seconds_remaining(self) -> Optional[int]:
        """Return the estimated seconds left, or None when it cannot be derived.

        The estimate extrapolates the average rate since ``start_time``. It
        is never negative: once more records than ``records_total`` have
        been processed the estimate is 0, matching the 100% cap on the
        percentage.
        """
        if not self.records_total or self.records_processed == 0:
            return None
        elapsed = time.time() - self.start_time
        if elapsed <= 0:
            return None
        rate = self.records_processed / elapsed
        if rate <= 0:
            return None
        remaining = max(0, self.records_total - self.records_processed)
        return int(remaining / rate)

    def should_report(self, min_interval: float = 1.0) -> bool:
        """Return True when at least ``min_interval`` seconds passed since the last update."""
        return (time.time() - self.last_update_time) >= min_interval

    def to_dict(self) -> dict:
        """Return a snapshot of the current progress as a plain dict."""
        return {
            "job_id": self.job_id,
            "progress_percent": self.get_progress_percent(),
            "records_processed": self.records_processed,
            "records_total": self.records_total,
            "estimated_seconds_remaining": self.get_estimated_seconds_remaining(),
            "current_step": self.current_step,
            "errors_count": self.errors_count,
        }
@@ -0,0 +1,20 @@
1
+ """Source connector registry."""
2
+
3
+ from .base import SourceConnector
4
+ from .calendar import CalendarSourceConnector
5
+ from .chatgpt import ChatGPTSourceConnector
6
+ from .grok import GrokSourceConnector
7
+
8
# Maps a source name to the connector class that handles it.
SOURCE_REGISTRY = {
    "chatgpt": ChatGPTSourceConnector,
    "grok": GrokSourceConnector,
    "calendar": CalendarSourceConnector,
}

# Public names re-exported by this package.
__all__ = [
    "SourceConnector",
    "ChatGPTSourceConnector",
    "GrokSourceConnector",
    "CalendarSourceConnector",
    "SOURCE_REGISTRY",
]
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Literal
5
+
6
+
7
@dataclass(frozen=True)
class SourcePayload:
    """Immutable wrapper around the payload handed to ``SourceConnector.ingest``."""

    payload: Dict[str, str]
10
+
11
+
12
@dataclass(frozen=True)
class RawRecord:
    """Immutable raw record: an identifier plus its payload mapping."""

    record_id: str
    # NOTE(review): annotated as str values, but the parsers shown alongside
    # also read numeric timestamps and nested metadata from it; confirm
    # whether this should be widened.
    payload: Dict[str, str]
16
+
17
+
18
@dataclass(frozen=True)
class SourceIdentity:
    """Immutable triple identifying where a record came from."""

    source_system: str
    source_record_id: str
    source_export_id: str
23
+
24
+
25
class SourceConnector:
    """Base interface for source connectors.

    Every method raises ``NotImplementedError`` here; subclasses provide the
    behaviour.
    """

    source_name: str
    source_type: Literal["file", "sqlite"]

    def ingest(self, payload: SourcePayload) -> str:
        """Ingest ``payload``; must be implemented by subclasses."""
        raise NotImplementedError

    def schema(self) -> Dict[str, str]:
        """Return the connector's schema description; must be implemented by subclasses."""
        raise NotImplementedError

    def identity(self, record: RawRecord) -> SourceIdentity:
        """Return the source identity for ``record``; must be implemented by subclasses."""
        raise NotImplementedError

    def canonical_eligible(self) -> bool:
        """Return the connector's canonical-eligibility flag; must be implemented by subclasses."""
        raise NotImplementedError
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Literal
5
+
6
+ from .base import RawRecord, SourceConnector, SourceIdentity, SourcePayload
7
+
8
+
9
@dataclass
class CalendarSourceConnector(SourceConnector):
    """Connector for the ``calendar`` source."""

    source_name: str = "calendar"
    source_type: Literal["file", "sqlite"] = "file"

    def ingest(self, payload: SourcePayload) -> str:
        """Return the calendar schema id; ``payload`` is not inspected."""
        del payload  # intentionally unused
        return "calendar.events.v1"

    def schema(self) -> Dict[str, str]:
        """Return a one-entry mapping holding the schema id."""
        description = {"schema_id": "calendar.events.v1"}
        return description

    def identity(self, record: RawRecord) -> SourceIdentity:
        """Build the identity; the record id doubles as the export id."""
        rid = record.record_id
        return SourceIdentity(
            source_system="calendar",
            source_record_id=rid,
            source_export_id=rid,
        )

    def canonical_eligible(self) -> bool:
        """Return False for this connector."""
        return False
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Literal
5
+
6
+ from .base import RawRecord, SourceConnector, SourceIdentity, SourcePayload
7
+
8
+
9
@dataclass
class ChatGPTSourceConnector(SourceConnector):
    """Connector for the ``chatgpt`` source."""

    source_name: str = "chatgpt"
    source_type: Literal["file", "sqlite"] = "file"

    def ingest(self, payload: SourcePayload) -> str:
        """Return the ChatGPT schema id; ``payload`` is not inspected."""
        del payload  # intentionally unused
        return "chatgpt.conversation.v1"

    def schema(self) -> Dict[str, str]:
        """Return a one-entry mapping holding the schema id."""
        description = {"schema_id": "chatgpt.conversation.v1"}
        return description

    def identity(self, record: RawRecord) -> SourceIdentity:
        """Build the identity; the record id doubles as the export id."""
        rid = record.record_id
        return SourceIdentity(
            source_system="chatgpt",
            source_record_id=rid,
            source_export_id=rid,
        )

    def canonical_eligible(self) -> bool:
        """Return True for this connector."""
        return True
@@ -0,0 +1,274 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import platform
6
+ import re
7
+ import shutil
8
+ import ssl
9
+ import subprocess
10
+ import time
11
+ import urllib.error
12
+ import urllib.parse
13
+ import urllib.request
14
+ from typing import Any, Dict, List, Optional
15
+
16
+
17
# Google OAuth device-flow endpoints and the People API connections listing.
GOOGLE_DEVICE_CODE_URL = "https://oauth2.googleapis.com/device/code"
GOOGLE_TOKEN_URL = "https://oauth2.googleapis.com/token"
GOOGLE_PEOPLE_CONNECTIONS_URL = "https://people.googleapis.com/v1/people/me/connections"

# Module logger for all contact-import messages.
logger = logging.getLogger("topos.ingestion.sources.contact_importers")
22
+
23
+
24
+ def _build_ssl_context() -> ssl.SSLContext:
25
+ """
26
+ Build a verified TLS context for outbound Google API calls.
27
+
28
+ On some macOS Python runtimes, default trust roots are missing and
29
+ certificate validation fails. Prefer certifi when available.
30
+ """
31
+ try:
32
+ import certifi
33
+
34
+ return ssl.create_default_context(cafile=certifi.where())
35
+ except Exception:
36
+ return ssl.create_default_context()
37
+
38
+
39
def _http_post_form(url: str, body: Dict[str, Any]) -> Dict[str, Any]:
    """POST ``body`` as a URL-encoded form and return the decoded JSON reply.

    Fields whose value is None are omitted. On an HTTP error status the
    error body is returned as parsed JSON when possible, otherwise as a
    synthetic ``{"error": "http_<code>", "error_description": <raw body>}``.
    Other failures (for example network errors) propagate to the caller.
    """
    fields = {key: value for key, value in body.items() if value is not None}
    encoded = urllib.parse.urlencode(fields).encode("utf-8")
    request = urllib.request.Request(
        url=url,
        data=encoded,
        method="POST",
        headers={"Content-Type": "application/x-www-form-urlencoded"},
    )
    try:
        with urllib.request.urlopen(request, timeout=30, context=_build_ssl_context()) as response:
            text = response.read().decode("utf-8")
            return json.loads(text)
    except urllib.error.HTTPError as http_err:
        error_body = http_err.read().decode("utf-8", errors="replace")
        try:
            return json.loads(error_body)
        except Exception:
            return {"error": f"http_{http_err.code}", "error_description": error_body}
56
+
57
+
58
def _http_get_json(url: str, bearer_token: str) -> Dict[str, Any]:
    """GET ``url`` with a Bearer token and return the decoded JSON reply.

    On an HTTP error status the error body is returned as parsed JSON when
    it is valid JSON.

    Raises:
        RuntimeError: on an HTTP error whose body is not JSON, or on a
            TLS/network failure.
    """
    auth_header = {"Authorization": f"Bearer {bearer_token}"}
    request = urllib.request.Request(url=url, method="GET", headers=auth_header)
    try:
        with urllib.request.urlopen(request, timeout=30, context=_build_ssl_context()) as response:
            text = response.read().decode("utf-8")
            return json.loads(text)
    except urllib.error.HTTPError as http_err:
        # HTTPError must be handled before URLError because it subclasses it.
        error_body = http_err.read().decode("utf-8", errors="replace")
        try:
            return json.loads(error_body)
        except Exception:
            raise RuntimeError(
                f"Google API request failed: HTTP {http_err.code}: {error_body[:500]}"
            ) from http_err
    except urllib.error.URLError as net_err:
        raise RuntimeError(
            f"Google API TLS/network error: {net_err}. "
            "If this is a certificate verify failure, ensure certifi is installed in the engine environment."
        ) from net_err
78
+
79
+
80
+ def _normalize_phone(value: Any) -> str:
81
+ s = str(value or "").strip()
82
+ if not s:
83
+ return ""
84
+ # Preserve a leading + where present, drop formatting characters.
85
+ plus = s.startswith("+")
86
+ digits = re.sub(r"[^\d]", "", s)
87
+ if not digits:
88
+ return ""
89
+ return f"+{digits}" if plus else digits
90
+
91
+
92
def import_apple_contacts_local() -> List[Dict[str, Any]]:
    """
    Read Apple Contacts locally on macOS via JXA (osascript JavaScript bridge).

    Returns normalized records:
    [{"display_name": str | None, "identifiers": [{"type", "identifier"}]}].
    Contacts with no phone number and no email are dropped.

    Raises:
        RuntimeError: when not on macOS, when osascript is missing or fails,
            or when its output is not a JSON array.
    """
    current_platform = platform.system().lower()
    logger.info("[CONTACT_IMPORT] Apple import start: platform=%s", current_platform)
    if current_platform != "darwin":
        raise RuntimeError("Apple Contacts import is only available on macOS")
    osascript_bin = shutil.which("osascript")
    logger.info("[CONTACT_IMPORT] Apple import environment: osascript=%s", osascript_bin or "missing")
    if not osascript_bin:
        raise RuntimeError("osascript not found in PATH; Apple Contacts import requires macOS host runtime")

    jxa_script = r"""
const app = Application("Contacts");
const people = app.people();
const out = [];
for (let i = 0; i < people.length; i++) {
  const p = people[i];
  const name = (() => { try { return String(p.name() || "").trim(); } catch (_) { return ""; } })();
  const phones = [];
  const emails = [];
  try {
    const ph = p.phones();
    for (let j = 0; j < ph.length; j++) {
      const val = String(ph[j].value() || "").trim();
      if (val) phones.push(val);
    }
  } catch (_) {}
  try {
    const em = p.emails();
    for (let j = 0; j < em.length; j++) {
      const val = String(em[j].value() || "").trim();
      if (val) emails.push(val);
    }
  } catch (_) {}
  if (name || phones.length || emails.length) out.push({ name, phones, emails });
}
JSON.stringify(out);
"""
    # Run the exact binary that was resolved (and logged) above rather than
    # looking the bare name up on PATH a second time.
    proc = subprocess.run(
        [osascript_bin, "-l", "JavaScript", "-e", jxa_script],
        capture_output=True,
        text=True,
        check=False,
    )
    if proc.returncode != 0:
        stderr = (proc.stderr or "").strip()
        stdout = (proc.stdout or "").strip()
        logger.error(
            "[CONTACT_IMPORT] Apple import osascript failed: returncode=%s stderr=%s stdout=%s",
            proc.returncode,
            stderr[:500],
            stdout[:500],
        )
        raise RuntimeError((stderr or stdout or "Failed to read Apple Contacts").strip())
    try:
        raw_items = json.loads(proc.stdout or "[]")
    except Exception as e:
        raise RuntimeError(f"Failed to parse Apple Contacts output: {e}") from e
    if not isinstance(raw_items, list):
        # The script always emits an array; anything else is malformed output.
        raise RuntimeError(
            f"Failed to parse Apple Contacts output: expected a JSON array, got {type(raw_items).__name__}"
        )

    imported: List[Dict[str, Any]] = []
    for item in raw_items:
        if not isinstance(item, dict):
            # Skip malformed entries instead of failing the whole import.
            continue
        name = str(item.get("name") or "").strip()
        identifiers: List[Dict[str, str]] = []
        for p in item.get("phones") or []:
            phone = _normalize_phone(p)
            if phone:
                identifiers.append({"type": "phone", "identifier": phone})
        for e in item.get("emails") or []:
            email = str(e or "").strip().lower()
            if email:
                identifiers.append({"type": "email", "identifier": email})
        if identifiers:
            imported.append({"display_name": name or None, "identifiers": identifiers})
    logger.info(
        "[CONTACT_IMPORT] Apple import complete: raw_contacts=%d imported_contacts=%d",
        len(raw_items),
        len(imported),
    )
    return imported
174
+
175
+
176
def start_google_device_auth(client_id: str) -> Dict[str, Any]:
    """Request a device code from Google for the contacts read-only scope.

    Returns the decoded response from the device-code endpoint unchanged,
    whether it describes success or an error.

    Raises:
        RuntimeError: if ``client_id`` is empty or blank.
    """
    if str(client_id or "").strip() == "":
        raise RuntimeError("google_client_id is required")
    logger.info("[CONTACT_IMPORT] Google device auth start requested")
    form = {
        "client_id": client_id.strip(),
        "scope": "openid https://www.googleapis.com/auth/contacts.readonly",
    }
    result = _http_post_form(GOOGLE_DEVICE_CODE_URL, form)
    if not result.get("error"):
        logger.info("[CONTACT_IMPORT] Google device auth start succeeded")
    else:
        logger.warning(
            "[CONTACT_IMPORT] Google device auth start failed: error=%s description=%s",
            result.get("error"),
            result.get("error_description"),
        )
    return result
196
+
197
+
198
def finish_google_device_auth(
    *,
    client_id: str,
    device_code: str,
    interval_seconds: int = 5,
    timeout_seconds: int = 120,
) -> Dict[str, Any]:
    """Poll Google's token endpoint until the device authorization resolves.

    Args:
        client_id: OAuth client id used to start the device flow.
        device_code: Device code returned by the device-code endpoint.
        interval_seconds: Seconds between polls; values below 2 (or None)
            are raised to 2 (or defaulted to 5).
        timeout_seconds: Overall polling budget; values below 30 are raised
            to 30, None falls back to 120.

    Returns:
        The token response on success, the error response on a terminal
        error, or ``{"error": "authorization_timeout", ...}`` on timeout.
    """
    logger.info(
        "[CONTACT_IMPORT] Google device auth finish polling start: interval=%s timeout=%s",
        interval_seconds,
        timeout_seconds,
    )
    interval = max(2, int(interval_seconds or 5))
    # Tolerate None the same way interval_seconds does instead of crashing in int().
    timeout = max(30, int(timeout_seconds if timeout_seconds is not None else 120))
    # Monotonic clock: the timeout must not be affected by wall-clock adjustments.
    started = time.monotonic()
    form = {
        "client_id": client_id.strip(),
        "device_code": device_code,
        "grant_type": "urn:ietf:params:oauth:grant-type:device_code",
    }
    while True:
        result = _http_post_form(GOOGLE_TOKEN_URL, form)
        if result.get("access_token"):
            logger.info("[CONTACT_IMPORT] Google device auth finish succeeded")
            return result
        error = result.get("error")
        if error not in {"authorization_pending", "slow_down"}:
            logger.warning(
                "[CONTACT_IMPORT] Google device auth finish failed: error=%s description=%s",
                error,
                result.get("error_description"),
            )
            return result
        if time.monotonic() - started > timeout:
            logger.warning("[CONTACT_IMPORT] Google device auth finish timed out after %ss", timeout)
            return {"error": "authorization_timeout", "error_description": "Timed out waiting for Google authorization"}
        if error == "slow_down":
            # RFC 8628 section 3.5: on slow_down the polling interval MUST be
            # increased by 5 seconds for this and all subsequent requests.
            interval += 5
        time.sleep(interval)
236
+
237
+
238
def import_google_contacts(access_token: str) -> List[Dict[str, Any]]:
    """Fetch the user's Google contacts and return them in normalized form.

    Reads up to 20 pages of 1000 connections each. Each returned record is
    ``{"display_name": str | None, "identifiers": [{"type", "identifier"}]}``;
    contacts without any phone number or email are dropped.

    Raises:
        RuntimeError: if a request fails or the API answers with an error
            payload (for example an expired or unauthorized token).
    """
    logger.info("[CONTACT_IMPORT] Google contacts fetch start")
    imported: List[Dict[str, Any]] = []
    page_token: Optional[str] = None
    base_params = {
        "pageSize": "1000",
        "personFields": "names,emailAddresses,phoneNumbers",
        "sortOrder": "LAST_MODIFIED_ASCENDING",
    }
    for _ in range(20):
        params = dict(base_params)
        if page_token:
            params["pageToken"] = page_token
        url = f"{GOOGLE_PEOPLE_CONNECTIONS_URL}?{urllib.parse.urlencode(params)}"
        data = _http_get_json(url, access_token)
        # _http_get_json hands back the parsed error body on HTTP failures;
        # surface it instead of silently reporting "no contacts".
        error = data.get("error")
        if error:
            message = error.get("message") if isinstance(error, dict) else error
            logger.warning("[CONTACT_IMPORT] Google contacts fetch failed: error=%s", message)
            raise RuntimeError(f"Google People API request failed: {message}")
        for person in data.get("connections") or []:
            name = None
            for n in person.get("names") or []:
                disp = str(n.get("displayName") or "").strip()
                if disp:
                    name = disp
                    break
            identifiers: List[Dict[str, str]] = []
            for p in person.get("phoneNumbers") or []:
                phone = _normalize_phone(p.get("value"))
                if phone:
                    identifiers.append({"type": "phone", "identifier": phone})
            for e in person.get("emailAddresses") or []:
                email = str(e.get("value") or "").strip().lower()
                if email:
                    identifiers.append({"type": "email", "identifier": email})
            if identifiers:
                imported.append({"display_name": name, "identifiers": identifiers})
        page_token = data.get("nextPageToken")
        if not page_token:
            break
    else:
        # The loop ran out of pages while the API still offered more.
        logger.warning(
            "[CONTACT_IMPORT] Google contacts fetch stopped at the 20-page limit; more contacts remain unread"
        )
    logger.info("[CONTACT_IMPORT] Google contacts fetch complete: imported_contacts=%d", len(imported))
    return imported
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Literal
5
+
6
+ from .base import RawRecord, SourceConnector, SourceIdentity, SourcePayload
7
+
8
+
9
@dataclass
class GrokSourceConnector(SourceConnector):
    """Connector for the ``grok`` source."""

    source_name: str = "grok"
    source_type: Literal["file", "sqlite"] = "file"

    def ingest(self, payload: SourcePayload) -> str:
        """Return the Grok schema id; ``payload`` is not inspected."""
        del payload  # intentionally unused
        return "grok.conversation.v1"

    def schema(self) -> Dict[str, str]:
        """Return a one-entry mapping holding the schema id."""
        description = {"schema_id": "grok.conversation.v1"}
        return description

    def identity(self, record: RawRecord) -> SourceIdentity:
        """Build the identity; the record id doubles as the export id."""
        rid = record.record_id
        return SourceIdentity(
            source_system="grok",
            source_record_id=rid,
            source_export_id=rid,
        )

    def canonical_eligible(self) -> bool:
        """Return False for this connector."""
        return False