topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,427 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ from dataclasses import dataclass, fields
6
+ from datetime import datetime, timezone
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+
9
+ from ..canonicalization.mappers import MAPPER_REGISTRY
10
+ from ..canonicalization.mappers.base import CanonicalMapper as LegacyCanonicalMapper
11
+ from ..canonicalization.mappers.base import CanonicalRecord, MappingMetadata
12
+ from ..canonicalization.models import CanonicalMessage
13
+ from ..ingestion.parsers import PARSER_REGISTRY
14
+ from ..ingestion.parsers.base import NormalizedRecord, Parser
15
+ from ..ingestion.sources.base import RawRecord
16
+ from ..ingestion.validation.base import ValidationResult
17
+ from ..sources.definitions import DataSourceDefinition
18
+ from ..sources.registry import REGISTRY
19
+ from ..storage.canonical.ai_chat.mapper import CanonicalMapper as StorageCanonicalMapper
20
+ from ..storage.canonical.ai_chat.mapper import register_mapper
21
+ from ..storage.canonical.ai_chat.model import CanonicalAIChatMessage
22
+ from ..storage.canonical.ai_chat import mapper as storage_mapper_module
23
+
24
+
25
+ def _maybe_parse_json(value: Any) -> Any:
26
+ if isinstance(value, str):
27
+ stripped = value.strip()
28
+ if stripped.startswith("{") or stripped.startswith("["):
29
+ try:
30
+ return json.loads(stripped)
31
+ except json.JSONDecodeError:
32
+ return value
33
+ return value
34
+
35
+
36
def _parse_source_definition_from_version_row(version_row: Dict[str, Any]) -> Dict[str, Any]:
    """Assemble a source-definition payload from a version row.

    ``source_definition_json`` (a dict or a JSON string) is the base. Values
    from ``compatibility_json`` and the row's own ``schema_id`` / ``source_id``
    are only used to fill gaps.

    Raises:
        ValueError: if ``source_definition_json`` is not a dict after decoding.
    """
    definition = _maybe_parse_json(version_row.get("source_definition_json"))
    if not isinstance(definition, dict):
        raise ValueError("source_definition_json must be a JSON object in the version row")

    compat = _maybe_parse_json(version_row.get("compatibility_json"))
    if isinstance(compat, dict):
        # setdefault: keys already present in the definition are left alone.
        for field_name in ("parser_id", "canonical_mapper_id", "source_type"):
            definition.setdefault(field_name, compat.get(field_name))

    # Row-level ids apply only when the definition has no truthy value for them.
    for field_name in ("schema_id", "source_id"):
        row_value = version_row.get(field_name)
        if row_value and not definition.get(field_name):
            definition[field_name] = row_value
    return definition
52
+
53
+
54
def _build_source_definition(payload: Dict[str, Any]) -> DataSourceDefinition:
    """Create a ``DataSourceDefinition`` from ``payload``, dropping unknown keys.

    Only keys that match a dataclass field name of ``DataSourceDefinition`` are
    passed to the constructor.
    """
    known_fields = set()
    for spec in fields(DataSourceDefinition):
        known_fields.add(spec.name)

    kwargs = {}
    for key, value in payload.items():
        if key in known_fields:
            kwargs[key] = value
    return DataSourceDefinition(**kwargs)
58
+
59
+
60
+ def _tokenize(path: str) -> List[str]:
61
+ return [part.strip() for part in str(path).split(".") if part.strip()]
62
+
63
+
64
+ def _walk_step(nodes: List[Any], token: str) -> List[Any]:
65
+ results: List[Any] = []
66
+ if token == "*":
67
+ for node in nodes:
68
+ if isinstance(node, dict):
69
+ results.extend(node.values())
70
+ elif isinstance(node, list):
71
+ results.extend(node)
72
+ return results
73
+
74
+ list_mode = token.endswith("[*]")
75
+ key = token[:-3] if list_mode else token
76
+ for node in nodes:
77
+ if not isinstance(node, dict):
78
+ continue
79
+ if key not in node:
80
+ continue
81
+ value = node.get(key)
82
+ if list_mode:
83
+ if isinstance(value, list):
84
+ results.extend(value)
85
+ elif value is not None:
86
+ results.append(value)
87
+ else:
88
+ results.append(value)
89
+ return results
90
+
91
+
92
def _extract_path(payload: Dict[str, Any], path: str) -> Any:
    """Resolve a dotted ``path`` against ``payload``.

    Returns ``None`` for an empty path or when nothing matches, the single
    match when exactly one node remains, and a list of matches otherwise.
    """
    if not path:
        return None
    frontier: List[Any] = [payload]
    for token in _tokenize(path):
        frontier = _walk_step(frontier, token)
        if not frontier:
            # Nothing left to descend into.
            return None
    return frontier[0] if len(frontier) == 1 else frontier
103
+
104
+
105
+ def _coerce_text(value: Any) -> str:
106
+ if value is None:
107
+ return ""
108
+ if isinstance(value, str):
109
+ return value
110
+ if isinstance(value, (int, float, bool)):
111
+ return str(value)
112
+ if isinstance(value, list):
113
+ parts = [_coerce_text(item) for item in value]
114
+ return "\n".join([part for part in parts if part])
115
+ if isinstance(value, dict):
116
+ return json.dumps(value, ensure_ascii=True)
117
+ return str(value)
118
+
119
+
120
+ def _normalize_ts(value: Any) -> str:
121
+ if value is None:
122
+ return ""
123
+ if isinstance(value, (int, float)):
124
+ return datetime.fromtimestamp(float(value), tz=timezone.utc).isoformat()
125
+ return _coerce_text(value)
126
+
127
+
128
+ def _derive_parser_extract_map_for_direct_table_passthrough(
129
+ source_def_payload: Dict[str, Any],
130
+ ) -> Tuple[Dict[str, Any], bool]:
131
+ file_ingest_shape = source_def_payload.get("file_ingest_shape")
132
+ parser_extract_map: Dict[str, Any] = {}
133
+ if isinstance(file_ingest_shape, dict):
134
+ maybe_map = file_ingest_shape.get("parser_extract_map")
135
+ if isinstance(maybe_map, dict):
136
+ parser_extract_map = dict(maybe_map)
137
+ if parser_extract_map:
138
+ return parser_extract_map, False
139
+
140
+ source_type = str(source_def_payload.get("source_type") or "").strip()
141
+ include_data_table = bool(source_def_payload.get("pipeline_include_data_table"))
142
+ if source_type != "file" or not include_data_table:
143
+ return parser_extract_map, False
144
+
145
+ tables = source_def_payload.get("tables")
146
+ if not isinstance(tables, list) or not tables:
147
+ raise ValueError(
148
+ "pipeline_include_data_table=true requires tables when parser_extract_map is empty"
149
+ )
150
+
151
+ column_names: List[str] = []
152
+ seen: set[str] = set()
153
+ for table in tables:
154
+ if not isinstance(table, dict):
155
+ continue
156
+ columns = table.get("columns")
157
+ if not isinstance(columns, list):
158
+ continue
159
+ for column in columns:
160
+ if not isinstance(column, dict):
161
+ continue
162
+ col_name = str(column.get("name") or "").strip()
163
+ if not col_name or col_name in seen:
164
+ continue
165
+ seen.add(col_name)
166
+ column_names.append(col_name)
167
+
168
+ if not column_names:
169
+ raise ValueError(
170
+ "pipeline_include_data_table=true with empty parser_extract_map requires at least one valid table column name"
171
+ )
172
+
173
+ derived_map = {col_name: col_name for col_name in column_names}
174
+ # Common case: table uses record_id but source rows provide id.
175
+ if "record_id" in derived_map and "id" in seen:
176
+ derived_map["record_id"] = "id"
177
+ return derived_map, True
178
+
179
+
180
def _build_dynamic_parser_class(
    *,
    source_def: DataSourceDefinition,
    parser_extract_map: Dict[str, Any],
    parser_id: str,
    requires_canonical_contract: bool,
    direct_table_passthrough: bool,
):
    """Create a ``Parser`` subclass bound to one runtime-installed source.

    The returned class closes over the arguments:

    * ``parser_extract_map`` maps output keys to dotted paths evaluated with
      ``_extract_path``; non-string paths are ignored.
    * ``requires_canonical_contract`` selects the output shape: a canonical
      message payload when true, the extracted fields as-is when false.
    * ``direct_table_passthrough`` enables the "nothing mapped" check and the
      ``record_id`` fallback in the non-canonical branch.
    * ``parser_id`` is the default schema id and is reported by ``validate``.
    """

    class RuntimeInstalledParser(Parser):
        def __init__(self, dataset_id: str, _schema_id: str = parser_id):
            self.dataset_id = dataset_id
            self._schema_id = _schema_id

        def parse(self, raw: RawRecord) -> NormalizedRecord:
            """Turn one raw record into a ``NormalizedRecord``."""
            payload = raw.payload
            extracted: Dict[str, Any] = {
                target_key: _extract_path(payload, path)
                for target_key, path in parser_extract_map.items()
                if isinstance(path, str)
            }

            if not requires_canonical_contract:
                # Passthrough with a non-empty dict payload where every mapped
                # column came back None: the derived map does not fit the data.
                nothing_mapped = (
                    direct_table_passthrough
                    and extracted
                    and isinstance(payload, dict)
                    and payload
                    and all(value is None for value in extracted.values())
                )
                if nothing_mapped:
                    keys_preview = sorted(str(key) for key in payload.keys())[:10]
                    raise ValueError(
                        "Direct table passthrough could not map any table columns from payload keys "
                        f"{keys_preview}. Define file_ingest_shape.parser_extract_map explicitly."
                    )
                if direct_table_passthrough and extracted.get("record_id") is None:
                    extracted["record_id"] = raw.record_id
                chosen_id = extracted.get("record_id") or extracted.get("id") or raw.record_id
                return NormalizedRecord(record_id=_coerce_text(chosen_id), payload=extracted)

            # Canonical contract: extracted values win, raw payload keys are fallbacks.
            message_id = _coerce_text(extracted.get("message_id") or payload.get("id") or raw.record_id)
            thread_id = _coerce_text(
                extracted.get("conversation_id")
                or extracted.get("thread_id")
                or payload.get("thread_id")
                or payload.get("conversation_id")
                or ""
            )
            role_hint = _coerce_text(
                extracted.get("sender_type") or extracted.get("role") or payload.get("role")
            ).lower()
            if role_hint in {"user", "human"}:
                sender_type = "human"
            else:
                sender_type = role_hint or "assistant"
            content = _coerce_text(
                extracted.get("content") or extracted.get("content_rendered") or payload.get("content")
            )
            ts = _normalize_ts(
                extracted.get("event_at")
                or extracted.get("created_at")
                or payload.get("created_at")
                or payload.get("ts")
            )

            normalized_payload: Dict[str, Any] = {
                "message_id": message_id,
                "dataset_id": self.dataset_id,
                "thread_id": thread_id,
                "ts": ts,
                "sender_type": sender_type,
                "content": content,
            }
            metadata = extracted.get("metadata_json") or payload.get("metadata")
            if metadata is not None:
                if not isinstance(metadata, dict):
                    # Non-dict metadata is wrapped so "_metadata" is always a dict.
                    metadata = {"metadata": _coerce_text(metadata)}
                normalized_payload["_metadata"] = metadata
            return NormalizedRecord(record_id=message_id, payload=normalized_payload)

        def validate(self, record: RawRecord) -> ValidationResult:
            """Accept any record whose payload is a dict."""
            if isinstance(record.payload, dict):
                return ValidationResult(
                    is_valid=True,
                    errors=[],
                    metadata={"dynamic_parser_id": parser_id, "source_id": source_def.source_id},
                )
            return ValidationResult(is_valid=False, errors=["Payload must be an object"], metadata={})

        def schema_id(self) -> str:
            return self._schema_id

    safe_source_id = source_def.source_id.replace("-", "_")
    RuntimeInstalledParser.__name__ = f"RuntimeInstalledParser_{safe_source_id}"
    return RuntimeInstalledParser
262
+
263
+
264
def _build_dynamic_legacy_mapper_class(source_def: DataSourceDefinition, mapper_id: str):
    """Create a legacy canonical mapper class bound to ``source_def``.

    The generated mapper converts a normalized payload into a
    ``CanonicalMessage`` and wraps its ``__dict__`` in a ``CanonicalRecord``.
    """

    class RuntimeInstalledLegacyMapper(LegacyCanonicalMapper):
        version: str = "dynamic.v1"

        def map(self, normalized: NormalizedRecord) -> CanonicalRecord:
            data = normalized.payload
            text = _coerce_text(data.get("content"))
            digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
            msg_id = _coerce_text(data.get("message_id") or normalized.record_id)
            conv_id = _coerce_text(
                data.get("thread_id") or data.get("conversation_id") or data.get("dataset_id")
            )
            # Ids without a ":" get prefixed with the source id.
            if conv_id and ":" not in conv_id:
                conv_id = f"{source_def.source_id}:{conv_id}"

            meta: Dict[str, Any] = {"mapper_version": self.version, "mapper_id": mapper_id}
            if "_metadata" in data:
                meta["_metadata"] = data.get("_metadata")

            message = CanonicalMessage(
                message_id=msg_id,
                conversation_id=conv_id,
                sender_type=_coerce_text(data.get("sender_type")),
                content=text,
                ts=_coerce_text(data.get("ts")) or None,
                source_id=source_def.source_id,
                content_hash=digest,
                metadata=meta,
            )
            return CanonicalRecord(record_id=message.message_id, payload=message.__dict__)

        def mapping_metadata(self, normalized: NormalizedRecord) -> MappingMetadata:
            return MappingMetadata(source_id=source_def.source_id, mapping_version=self.version)

    safe_source_id = source_def.source_id.replace("-", "_")
    RuntimeInstalledLegacyMapper.__name__ = f"RuntimeInstalledLegacyMapper_{safe_source_id}"
    return RuntimeInstalledLegacyMapper
296
+
297
+
298
def _build_dynamic_storage_mapper_class(source_def: DataSourceDefinition, mapper_id: str):
    """Create a storage-layer canonical mapper class bound to ``source_def``.

    The generated mapper turns one staging record into a single-element list
    holding a ``CanonicalAIChatMessage``.
    """

    class RuntimeInstalledStorageMapper(StorageCanonicalMapper):
        def map_to_canonical(self, staging_record: Dict[str, Any], source: str) -> List[CanonicalAIChatMessage]:
            conv_id = self.extract_conversation_id(staging_record)
            # A source_id on the record overrides the installed definition's id.
            effective_source_id = _coerce_text(staging_record.get("source_id") or source_def.source_id)
            extra = {
                "original_source": source,
                "mapper_id": mapper_id,
                "thread_id": staging_record.get("thread_id"),
                "_metadata": staging_record.get("_metadata"),
            }
            return [
                CanonicalAIChatMessage(
                    message_id=_coerce_text(staging_record.get("message_id")),
                    conversation_id=conv_id,
                    sender_type=_coerce_text(staging_record.get("sender_type") or "assistant"),
                    sender_id=_coerce_text(staging_record.get("sender_id")) or None,
                    ts=_coerce_text(staging_record.get("ts")),
                    content=_coerce_text(staging_record.get("content")),
                    source_id=effective_source_id,
                    content_rendered=_coerce_text(staging_record.get("content_rendered")) or None,
                    metadata_json=extra,
                )
            ]

        def extract_conversation_id(self, staging_record: Dict[str, Any]) -> str:
            conv = _coerce_text(staging_record.get("thread_id") or staging_record.get("conversation_id") or "")
            if not conv:
                # No thread/conversation id: fall back to "<source_id>:<dataset_id>".
                return f"{source_def.source_id}:{_coerce_text(staging_record.get('dataset_id'))}"
            if ":" in conv:
                return conv
            return f"{source_def.source_id}:{conv}"

    safe_source_id = source_def.source_id.replace("-", "_")
    RuntimeInstalledStorageMapper.__name__ = f"RuntimeInstalledStorageMapper_{safe_source_id}"
    return RuntimeInstalledStorageMapper
329
+
330
+
331
@dataclass
class RuntimeInstallHandle:
    """Record of one runtime install, able to undo it via ``uninstall``."""

    # Id of the installed source definition (key in REGISTRY).
    source_id: str
    # Keys written to PARSER_REGISTRY by the install.
    parser_ids: List[str]
    # Key written to both mapper registries, or None when no mapper was installed.
    canonical_mapper_id: Optional[str]
    # Registry entries that were present before the install (None = no entry).
    _previous_source: Optional[DataSourceDefinition]
    _previous_parsers: Dict[str, Any]
    _previous_legacy_mapper: Optional[Any]
    _previous_storage_mapper: Optional[Any]

    def uninstall(self) -> None:
        """Put every touched registry key back to its pre-install state."""

        def restore(registry: Any, key: str, previous: Any) -> None:
            # None means the key did not exist before, so remove it again.
            if previous is None:
                registry.pop(key, None)
            else:
                registry[key] = previous

        restore(REGISTRY, self.source_id, self._previous_source)

        for parser_id in self.parser_ids:
            restore(PARSER_REGISTRY, parser_id, self._previous_parsers.get(parser_id))

        mapper_id = self.canonical_mapper_id
        if mapper_id:
            restore(MAPPER_REGISTRY, mapper_id, self._previous_legacy_mapper)
            restore(storage_mapper_module._MAPPER_REGISTRY, mapper_id, self._previous_storage_mapper)
365
+
366
+
367
def install_source_definition(source_def_payload: Dict[str, Any]) -> RuntimeInstallHandle:
    """Install a source definition, its parser and (optionally) its mappers.

    The source is registered in ``REGISTRY``. A dynamically built parser class
    is registered under the definition's ``schema_id`` and ``parser_id``. When
    the definition has a ``canonical_mapper_id``, dynamic legacy and storage
    mappers are registered under it.

    All work that can fail (building the definition, deriving the extract map,
    creating the dynamic classes) happens before any registry is modified, so
    a rejected payload leaves the registries untouched.

    Args:
        source_def_payload: Source-definition dict; keys that are not
            ``DataSourceDefinition`` fields are still consulted here
            (``canonical_group_id``, ``canonical_mapping_connected``,
            ``file_ingest_shape``, ``tables``, ...).

    Returns:
        A handle whose ``uninstall()`` restores the previous registry entries.

    Raises:
        ValueError: propagated from the extract-map derivation when a direct
            table passthrough is requested without usable table columns.
    """
    # ---- Preparation: nothing below touches a registry. ----
    source_def = _build_source_definition(source_def_payload)
    source_id = source_def.source_id

    canonical_mapper_id = str(source_def_payload.get("canonical_mapper_id") or "").strip()
    canonical_group_id = str(source_def_payload.get("canonical_group_id") or "").strip()
    canonical_mapping_connected = bool(source_def_payload.get("canonical_mapping_connected"))
    requires_canonical_contract = bool(canonical_mapping_connected or canonical_mapper_id or canonical_group_id)
    parser_extract_map, direct_table_passthrough = _derive_parser_extract_map_for_direct_table_passthrough(
        source_def_payload
    )

    # The same parser class is registered under both ids (deduplicated, sorted).
    parser_ids = sorted(
        {
            item
            for item in [str(source_def.schema_id or "").strip(), str(source_def.parser_id or "").strip()]
            if item
        }
    )
    parser_cls = _build_dynamic_parser_class(
        source_def=source_def,
        parser_extract_map=parser_extract_map,
        parser_id=str(source_def.schema_id or source_def.parser_id),
        requires_canonical_contract=requires_canonical_contract,
        direct_table_passthrough=direct_table_passthrough,
    )

    mapper_id = str(source_def.canonical_mapper_id or "").strip() or None
    legacy_mapper_cls = None
    storage_mapper_cls = None
    if mapper_id:
        legacy_mapper_cls = _build_dynamic_legacy_mapper_class(source_def, mapper_id)
        storage_mapper_cls = _build_dynamic_storage_mapper_class(source_def, mapper_id)

    # ---- Registration: remember each previous entry, then overwrite it. ----
    previous_source = REGISTRY.get(source_id)
    REGISTRY[source_id] = source_def

    previous_parsers: Dict[str, Any] = {}
    for parser_id in parser_ids:
        previous_parsers[parser_id] = PARSER_REGISTRY.get(parser_id)
        PARSER_REGISTRY[parser_id] = parser_cls

    previous_legacy_mapper = None
    previous_storage_mapper = None
    if mapper_id:
        previous_legacy_mapper = MAPPER_REGISTRY.get(mapper_id)
        MAPPER_REGISTRY[mapper_id] = legacy_mapper_cls

        previous_storage_mapper = storage_mapper_module._MAPPER_REGISTRY.get(mapper_id)
        register_mapper(mapper_id, storage_mapper_cls)

    return RuntimeInstallHandle(
        source_id=source_id,
        parser_ids=parser_ids,
        canonical_mapper_id=mapper_id,
        _previous_source=previous_source,
        _previous_parsers=previous_parsers,
        _previous_legacy_mapper=previous_legacy_mapper,
        _previous_storage_mapper=previous_storage_mapper,
    )
422
+
423
+
424
def install_source_from_version_row(version_row: Dict[str, Any]) -> Tuple[RuntimeInstallHandle, Dict[str, Any]]:
    """Install the source described by ``version_row``.

    Returns the install handle together with the source-definition payload
    that was parsed from the row and installed.
    """
    payload = _parse_source_definition_from_version_row(version_row)
    return install_source_definition(payload), payload
@@ -0,0 +1 @@
1
+ """Storage layer for Topos."""
@@ -0,0 +1,18 @@
1
+ """Canonical storage abstractions."""
2
+
3
+ from .conversations_tables import (
4
+ ConversationsTablesManager,
5
+ ensure_all_tables,
6
+ ensure_conversation_messages_table,
7
+ ensure_conversations_table,
8
+ )
9
+ from .ai_chat import CanonicalTablesManager, Canonicalizer
10
+
11
+ __all__ = [
12
+ "ConversationsTablesManager",
13
+ "ensure_all_tables",
14
+ "ensure_conversation_messages_table",
15
+ "ensure_conversations_table",
16
+ "CanonicalTablesManager",
17
+ "Canonicalizer",
18
+ ]
@@ -0,0 +1,22 @@
1
+ """Canonical AI chat layer - unified data models for AI chat sources.
2
+
3
+ Migrated from engine/canonical/ (commit 7b709af).
4
+ Maps source-specific staging data (e.g. ChatGPT) into ai_chat_messages / ai_chat_conversations.
5
+ """
6
+
7
+ from .model import CanonicalAIChatModel, CanonicalAIChatMessage, CanonicalAIChatConversation
8
+ from .mapper import CanonicalMapper, get_mapper, ChatGPTToAIChatMapper, StoreMessageToAIChatMapper
9
+ from .tables import CanonicalTablesManager
10
+ from .canonicalizer import Canonicalizer
11
+
12
+ __all__ = [
13
+ "CanonicalAIChatModel",
14
+ "CanonicalAIChatMessage",
15
+ "CanonicalAIChatConversation",
16
+ "CanonicalMapper",
17
+ "get_mapper",
18
+ "ChatGPTToAIChatMapper",
19
+ "StoreMessageToAIChatMapper",
20
+ "CanonicalTablesManager",
21
+ "Canonicalizer",
22
+ ]
@@ -0,0 +1,147 @@
1
+ """Canonicalizer - orchestrates canonicalization of staging data.
2
+
3
+ Migrated from engine/canonical/canonicalizer.py (commit 7b709af).
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from collections import defaultdict
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from .mapper import get_mapper
13
+ from .model import CanonicalAIChatConversation
14
+ from .tables import CanonicalTablesManager
15
+
16
+ logger = logging.getLogger("topos.storage.canonical.ai_chat.canonicalizer")
17
+
18
+
19
class Canonicalizer:
    """Orchestrates canonicalization of staging data to canonical models."""

    def __init__(self, tables_manager: CanonicalTablesManager):
        """Initialize with canonical tables manager."""
        self.tables_manager = tables_manager

    def canonicalize_staging_batch(
        self,
        staging_records: List[Dict[str, Any]],
        source: str,
        batch_size: int = 1000,
    ) -> Dict[str, Any]:
        """Canonicalize a batch of staging records.

        Each record is mapped with the mapper registered for ``source``. The
        resulting messages are grouped into conversations, and both are
        written through the tables manager. Mapping and write failures are
        collected in ``errors`` instead of being raised.

        Args:
            staging_records: List of records from staging table
            source: Source identifier (e.g., "chatgpt")
            batch_size: Batch size for writing canonical records

        Returns:
            Dict with canonicalization results:
            {
                "conversations_created": int,
                "messages_created": int,
                "canonical_messages": List[Dict],
                "errors": List[Dict]
            }
        """
        if not staging_records:
            return {
                "conversations_created": 0,
                "messages_created": 0,
                "canonical_messages": [],
                "errors": [],
            }

        try:
            mapper = get_mapper(source)
        except ValueError as exc:
            logger.error("No mapper found for source %s: %s", source, exc)
            return {
                "conversations_created": 0,
                "messages_created": 0,
                "canonical_messages": [],
                "errors": [{"error": str(exc), "source": source}],
            }

        canonical_messages: List[Any] = []
        conversation_owners: Dict[str, str] = {}
        errors: List[Dict[str, Any]] = []

        for record in staging_records:
            try:
                messages = mapper.map_to_canonical(record, source)
                # dataset_id may be missing, None or a non-string; coerce it so
                # the owner derivation cannot fail on its type.
                dataset_id = str(record.get("dataset_id") or "")
                # Owner is the part of dataset_id before the first ":" (if any).
                owner_user_id = dataset_id.split(":")[0] if ":" in dataset_id else ""
                record_owners: Dict[str, str] = {}
                for msg in messages:
                    if msg.conversation_id not in conversation_owners:
                        record_owners.setdefault(msg.conversation_id, owner_user_id)
            except Exception as exc:
                logger.error("Failed to map staging record to canonical: %s", exc)
                errors.append({
                    "record": record,
                    "error": str(exc),
                    "source": source,
                })
            else:
                # Commit only after the whole record was processed, so a record
                # reported in `errors` never also contributes messages.
                canonical_messages.extend(messages)
                conversation_owners.update(record_owners)

        if not canonical_messages:
            return {
                "conversations_created": 0,
                "messages_created": 0,
                "canonical_messages": [],
                "errors": errors,
            }

        conversations_dict: Dict[str, List[Any]] = defaultdict(list)
        for msg in canonical_messages:
            conversations_dict[msg.conversation_id].append(msg)

        conversations: List[CanonicalAIChatConversation] = []
        for conversation_id, messages in conversations_dict.items():
            owner_user_id = conversation_owners.get(conversation_id, "")
            # Falsy timestamps are ignored; min/max compare the ts values as stored.
            timestamps = [msg.ts for msg in messages if msg.ts]
            created_at = min(timestamps) if timestamps else ""
            updated_at = max(timestamps) if timestamps else ""
            conversation = CanonicalAIChatConversation(
                conversation_id=conversation_id,
                owner_user_id=owner_user_id,
                title=None,
                source=source,
                created_at=created_at,
                updated_at=updated_at,
            )
            conversations.append(conversation)

        conversations_created = 0
        try:
            conversations_created = self.tables_manager.write_conversations_batch(
                conversations, batch_size=batch_size
            )
        except Exception as exc:
            logger.error("Failed to write conversations: %s", exc)
            errors.append({"error": f"Failed to write conversations: {exc}", "source": source})

        messages_created = 0
        try:
            messages_created = self.tables_manager.write_messages_batch(
                canonical_messages, batch_size=batch_size
            )
        except Exception as exc:
            logger.error("Failed to write messages: %s", exc)
            errors.append({"error": f"Failed to write messages: {exc}", "source": source})

        # Best effort: a sequence-update failure is logged but not reported in `errors`.
        for conversation_id in conversations_dict:
            try:
                self.tables_manager.update_message_sequences(conversation_id)
            except Exception as exc:
                logger.warning("Failed to update sequences for conversation %s: %s", conversation_id, exc)

        canonical_messages_dicts = [msg.to_dict() for msg in canonical_messages]

        return {
            "conversations_created": conversations_created,
            "messages_created": messages_created,
            "canonical_messages": canonical_messages_dicts,
            "errors": errors,
        }