topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1100 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ import re
7
+ from dataclasses import dataclass
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ from typing import Any, AsyncIterator, Dict, List, Optional
11
+
12
+ from .checkpoints.checkpoint_store import CheckpointStore, IngestionCheckpoint
13
+ from .parser import parse_file
14
+ from .parsers import PARSER_REGISTRY
15
+ from .progress import IngestionProgress
16
+ from .sources.base import RawRecord
17
+ from .state_machine import IngestionJob
18
+ from ..canonicalization.mappers import MAPPER_REGISTRY
19
+ from ..config.settings import settings
20
+ from ..enrichment.derived_tables import DerivedTablesManager
21
+ from ..enrichment.jobs import CANONICAL_JOBS
22
+ from ..enrichment.orchestrator import EnrichmentOrchestrator
23
+ from ..enrichment.progress_bar import ProgressBar
24
+ from ..engine.usage_observation import emit_usage_observation
25
+ from ..sources.registry import REGISTRY
26
+ from ..storage.db.postgres import connect_postgres
27
+ from ..storage.raw.file_store import RawFileStore
28
+ from ..utils.base_object import BaseObject
29
+
30
+ logger = logging.getLogger("topos.ingestion.manager")
31
+
32
+
33
+ def _owner_user_id_from_dataset_id(dataset_id: Optional[str]) -> Optional[str]:
34
+ raw = str(dataset_id or "").strip()
35
+ if not raw or ":" not in raw:
36
+ return None
37
+ owner = raw.split(":", 1)[0].strip()
38
+ return owner or None
39
+
40
+
41
+ def _control_plane_base_url(raw_url: Optional[str]) -> str:
42
+ value = str(raw_url or "").strip()
43
+ if value.startswith("wss://"):
44
+ return value.replace("wss://", "https://").split("/ws/")[0]
45
+ if value.startswith("ws://"):
46
+ return value.replace("ws://", "http://").split("/ws/")[0]
47
+ return value.rstrip("/")
48
+
49
+
50
def _filter_unenriched_messages(
    canonical_messages: List[Dict[str, Any]],
    job_names: List[str],
    tables_manager: DerivedTablesManager,
    *,
    source_id: Optional[str] = None,
    dataset_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Filter out messages that have already been enriched.

    A message is treated as enriched as soon as ANY of the requested jobs has
    a row for its ``message_id`` in that job's derived table.

    Args:
        canonical_messages: List of canonical message dictionaries.
        job_names: List of enrichment job names to check.
        tables_manager: DerivedTablesManager instance; its ``conn`` is queried
            with SQLite syntax (``sqlite_master``, ``?`` placeholders).
        source_id: When given, a derived row only counts if it joins to an
            ``ai_chat_messages`` row with this ``source_id``.
        dataset_id: When given together with ``source_id``, the text before
            the first ``:`` (or the whole id when there is no ``:``) is used
            as the owner, and rows must additionally join to an
            ``ai_chat_conversations`` row with that ``owner_user_id`` if that
            table exists.

    Returns:
        List of messages that haven't been enriched yet. The input list itself
        is returned when there is nothing to check (no messages, no jobs, no
        connection, or no usable message ids).
    """
    if not canonical_messages or not job_names:
        return canonical_messages

    conn = tables_manager.conn
    if not conn:
        # No database connection, can't check - return all messages
        logger.debug("[PIPELINE:ENRICHMENT] No database connection, processing all messages")
        return canonical_messages

    def _message_key(message: Dict[str, Any]) -> str:
        # Single normalisation used for both the SQL lookup and the final
        # filter, so int or whitespace-padded ids compare consistently.
        return str(message.get("message_id") or "").strip()

    def _table_exists(name: str) -> bool:
        row = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (name,),
        ).fetchone()
        return bool(row)

    # Create mapping from job name to table name
    job_to_table = {job.get_job_name(): job.get_derived_table() for job in CANONICAL_JOBS}

    candidate_ids = sorted({key for key in map(_message_key, canonical_messages) if key})
    if not candidate_ids:
        return canonical_messages

    owner_user_id = ""
    if dataset_id:
        owner_user_id = str(dataset_id).split(":", 1)[0].strip()

    # Older SQLite builds cap bound parameters per statement at 999; stay well
    # below that, leaving room for the scope parameters.
    id_batch_size = 500

    # Scope checks by source_id and (when available) dataset owner so one
    # source/user does not suppress enrichment for another when message_id
    # collides.
    enriched_message_ids: set[str] = set()
    for job_name in job_names:
        table_name = job_to_table.get(job_name)
        if not table_name:
            logger.warning("[PIPELINE:ENRICHMENT] Unknown job name: %s, skipping check", job_name)
            continue

        try:
            if not _table_exists(table_name):
                # Table doesn't exist yet, no messages are enriched
                continue

            # table_name comes from the CANONICAL_JOBS registry, not from
            # request data, which is why it is interpolated directly.
            if source_id and owner_user_id and _table_exists("ai_chat_conversations"):
                select_prefix = (
                    f"SELECT DISTINCT d.message_id FROM {table_name} d "
                    "INNER JOIN ai_chat_messages m ON m.message_id = d.message_id "
                    "INNER JOIN ai_chat_conversations c ON c.conversation_id = m.conversation_id "
                    "WHERE m.source_id = ? AND c.owner_user_id = ? AND d.message_id IN "
                )
                scope_params: tuple = (source_id, owner_user_id)
            elif source_id:
                select_prefix = (
                    f"SELECT DISTINCT d.message_id FROM {table_name} d "
                    "INNER JOIN ai_chat_messages m ON m.message_id = d.message_id "
                    "WHERE m.source_id = ? AND d.message_id IN "
                )
                scope_params = (source_id,)
            else:
                select_prefix = f"SELECT DISTINCT message_id FROM {table_name} WHERE message_id IN "
                scope_params = ()

            found_for_job: set[str] = set()
            for start in range(0, len(candidate_ids), id_batch_size):
                batch = candidate_ids[start:start + id_batch_size]
                placeholders = ",".join("?" * len(batch))
                cursor = conn.execute(
                    f"{select_prefix}({placeholders})",
                    (*scope_params, *batch),
                )
                found_for_job.update(str(row[0]) for row in cursor.fetchall() if row and row[0])
        except Exception as e:
            logger.warning(
                "[PIPELINE:ENRICHMENT] Failed to check enriched messages in %s: %s",
                table_name,
                e,
            )
            # On error, assume no messages are enriched (safer to process than skip)
            continue

        # Merge only after every batch for this table succeeded.
        enriched_message_ids |= found_for_job

    # Filter to only messages that haven't been enriched
    unenriched = [
        msg for msg in canonical_messages
        if _message_key(msg) not in enriched_message_ids
    ]

    if len(unenriched) < len(canonical_messages):
        logger.debug(
            "[PIPELINE:ENRICHMENT] Filtered %d already-enriched messages, %d new messages to process",
            len(canonical_messages) - len(unenriched),
            len(unenriched),
        )

    return unenriched
184
+
185
+
186
+ async def _read_file_bytes(file_path: Path) -> AsyncIterator[bytes]:
187
+ def read_all() -> bytes:
188
+ return file_path.read_bytes()
189
+
190
+ file_data = await asyncio.to_thread(read_all)
191
+ chunk_size = 8192
192
+ for i in range(0, len(file_data), chunk_size):
193
+ yield file_data[i : i + chunk_size]
194
+
195
+
196
+ _SQL_IDENTIFIER_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
197
+
198
+
199
+ def _is_valid_sql_identifier(value: str) -> bool:
200
+ return bool(_SQL_IDENTIFIER_RE.match(value or ""))
201
+
202
+
203
+ def _sql_type_for_source_column(column_type: str) -> str:
204
+ ctype = str(column_type or "").strip().lower()
205
+ if ctype in {"identifier", "text"}:
206
+ return "TEXT"
207
+ if ctype in {"real", "float", "number"}:
208
+ return "REAL"
209
+ if ctype in {"integer", "int"}:
210
+ return "INTEGER"
211
+ if ctype in {"json"}:
212
+ return "TEXT"
213
+ return "TEXT"
214
+
215
+
216
+ def _coerce_table_value(value: Any, *, declared_type: str) -> Any:
217
+ ctype = str(declared_type or "").strip().lower()
218
+ if value is None:
219
+ return None
220
+ if ctype == "json":
221
+ if isinstance(value, str):
222
+ return value
223
+ return json.dumps(value, ensure_ascii=True)
224
+ if isinstance(value, (dict, list)):
225
+ return json.dumps(value, ensure_ascii=True)
226
+ return value
227
+
228
+
229
+ def _tokenize_path(path: str) -> List[str]:
230
+ return [part.strip() for part in str(path).split(".") if part.strip()]
231
+
232
+
233
+ def _walk_path_step(nodes: List[Any], token: str) -> List[Any]:
234
+ out: List[Any] = []
235
+ if token == "*":
236
+ for node in nodes:
237
+ if isinstance(node, dict):
238
+ out.extend(node.values())
239
+ elif isinstance(node, list):
240
+ out.extend(node)
241
+ return out
242
+
243
+ list_mode = token.endswith("[*]")
244
+ key = token[:-3] if list_mode else token
245
+ for node in nodes:
246
+ if not isinstance(node, dict):
247
+ continue
248
+ if key not in node:
249
+ continue
250
+ value = node.get(key)
251
+ if list_mode:
252
+ if isinstance(value, list):
253
+ out.extend(value)
254
+ elif value is not None:
255
+ out.append(value)
256
+ else:
257
+ out.append(value)
258
+ return out
259
+
260
+
261
+ def _extract_path_value(payload: Dict[str, Any], path: str) -> Any:
262
+ if not path:
263
+ return payload
264
+ nodes: List[Any] = [payload]
265
+ for token in _tokenize_path(path):
266
+ nodes = _walk_path_step(nodes, token)
267
+ if not nodes:
268
+ return None
269
+ if len(nodes) == 1:
270
+ return nodes[0]
271
+ return nodes
272
+
273
+
274
+ def _expand_file_records(raw_payload: Dict[str, Any], source_def: Optional[Any]) -> List[Dict[str, Any]]:
275
+ if not isinstance(raw_payload, dict):
276
+ return []
277
+ ingest_shape = getattr(source_def, "file_ingest_shape", None) if source_def else None
278
+ if not isinstance(ingest_shape, dict):
279
+ return [raw_payload]
280
+ record_path = str(ingest_shape.get("raw_record_path") or "").strip()
281
+ if not record_path:
282
+ return [raw_payload]
283
+ extracted = _extract_path_value(raw_payload, record_path)
284
+ if isinstance(extracted, list):
285
+ return [item for item in extracted if isinstance(item, dict)]
286
+ if isinstance(extracted, dict):
287
+ return [extracted]
288
+ return []
289
+
290
+
291
def _persist_source_data_tables(
    *,
    db_conn: Any,
    source_def: Optional[Any],
    dataset_id: str,
    normalized_records: List[Any],
) -> None:
    """Persist normalized records into the source's declared data tables.

    When ``settings.topos_database_mode`` is ``"postgres"`` a connection from
    ``connect_postgres()`` is used and the ``db_conn`` argument is ignored;
    in every other mode the rows are written through ``db_conn``.
    """
    shared_kwargs = {
        "source_def": source_def,
        "dataset_id": dataset_id,
        "normalized_records": normalized_records,
    }

    if settings.topos_database_mode != "postgres":
        _persist_source_data_tables_on_connection(db_conn=db_conn, **shared_kwargs)
        return

    # Hosted mode should persist source tables in Postgres so rows survive engine restarts.
    with connect_postgres() as hosted_conn:
        _persist_source_data_tables_on_connection(db_conn=hosted_conn, **shared_kwargs)
315
+
316
+
317
def _existing_table_columns(db_conn: Any, table_id: str, *, is_sqlite: bool) -> set:
    """Return the names of the columns currently present on ``table_id``."""
    if is_sqlite:
        rows = db_conn.execute(f'PRAGMA table_info("{table_id}")').fetchall()
        return {str(row["name"]) if isinstance(row, dict) else str(row[1]) for row in rows}
    rows = db_conn.execute(
        """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema='public' AND table_name=%s
        """,
        (table_id,),
    ).fetchall()
    # Accept dict-style rows as well, mirroring the SQLite branch.
    return {str(row["column_name"]) if isinstance(row, dict) else str(row[0]) for row in rows}


def _build_source_table_upsert_sql(
    table_id: str,
    column_names: List[str],
    pk_cols: List[str],
    *,
    is_sqlite: bool,
) -> str:
    """Build the INSERT/upsert statement for one source data table."""
    quoted_columns = ", ".join([f'"{name}"' for name in column_names])
    placeholders = ", ".join(["?" if is_sqlite else "%s"] * len(column_names))
    if is_sqlite:
        return f'INSERT OR REPLACE INTO "{table_id}" ({quoted_columns}) VALUES ({placeholders})'

    insert_sql = f'INSERT INTO "{table_id}" ({quoted_columns}) VALUES ({placeholders})'
    conflict_cols = [name for name in pk_cols if name in column_names]
    if not conflict_cols:
        return insert_sql
    conflict_sql = ", ".join([f'"{name}"' for name in conflict_cols])
    non_pk_cols = [name for name in column_names if name not in conflict_cols]
    if non_pk_cols:
        update_sql = ", ".join([f'"{name}" = EXCLUDED."{name}"' for name in non_pk_cols])
        return f'{insert_sql} ON CONFLICT ({conflict_sql}) DO UPDATE SET {update_sql}'
    return f'{insert_sql} ON CONFLICT ({conflict_sql}) DO NOTHING'


def _persist_source_data_tables_on_connection(
    *,
    db_conn: Any,
    source_def: Optional[Any],
    dataset_id: str,
    normalized_records: List[Any],
) -> None:
    """Create/extend the source's declared tables on ``db_conn`` and upsert rows.

    Nothing happens unless ``source_def.pipeline_include_data_table`` is
    truthy, ``source_def.tables`` is a non-empty list and there are records.
    For each declared table with a valid identifier the table is created if
    missing, any declared column absent from an existing table is added, and
    one row per record payload is written. ``dataset_id``, ``owner_user_id``
    and ``tenant_id`` columns are always present and are filled from
    ``dataset_id`` (first and third non-empty ``:``-separated parts) when the
    payload has no value for them. A single ``commit()`` is issued at the end.

    Args:
        db_conn: Connection exposing ``execute``/``commit``. A connection
            class whose module name contains ``sqlite`` gets SQLite syntax;
            anything else gets ``%s`` placeholders and ``ON CONFLICT``.
        source_def: Source definition providing ``tables``.
        dataset_id: Dataset identifier used for the scope columns.
        normalized_records: Records whose ``payload`` dicts supply row values.
    """
    if not db_conn or not source_def:
        return
    if not bool(getattr(source_def, "pipeline_include_data_table", False)):
        return
    tables = getattr(source_def, "tables", None)
    if not isinstance(tables, list) or not tables:
        return
    if not normalized_records:
        return

    dataset_parts = [part for part in str(dataset_id or "").split(":") if part]
    owner_user_id: Optional[str] = dataset_parts[0] if dataset_parts else None
    tenant_id: Optional[str] = dataset_parts[2] if len(dataset_parts) >= 3 else None
    scope_fallbacks: Dict[str, Any] = {
        "dataset_id": dataset_id,
        "owner_user_id": owner_user_id,
        "tenant_id": tenant_id,
    }

    pooled_scope_columns: List[Dict[str, Any]] = [
        {"name": "dataset_id", "type": "text"},
        {"name": "owner_user_id", "type": "text"},
        {"name": "tenant_id", "type": "text"},
    ]
    pooled_names = {str(col["name"]) for col in pooled_scope_columns}

    is_sqlite = "sqlite" in db_conn.__class__.__module__.lower()

    for table in tables:
        if not isinstance(table, dict):
            continue
        table_id = str(table.get("table_id") or "").strip()
        columns = table.get("columns")
        if not table_id or not _is_valid_sql_identifier(table_id):
            logger.warning("[PIPELINE:DATA_TABLE] Skipping invalid table_id=%r", table_id)
            continue
        if not isinstance(columns, list) or not columns:
            continue

        # Declared columns with a usable identifier, followed by any scope
        # column the definition did not declare itself.
        valid_columns: List[Dict[str, Any]] = []
        for column in columns:
            if not isinstance(column, dict):
                continue
            col_name = str(column.get("name") or "").strip()
            if not col_name or not _is_valid_sql_identifier(col_name):
                continue
            valid_columns.append(column)
        existing_names = {str(col.get("name") or "").strip() for col in valid_columns}
        for pooled_col in pooled_scope_columns:
            pooled_name = str(pooled_col["name"])
            if pooled_name in existing_names:
                continue
            valid_columns.append(dict(pooled_col))
            existing_names.add(pooled_name)

        if not valid_columns:
            continue

        defs: List[str] = []
        pk_cols: List[str] = []
        for column in valid_columns:
            col_name = str(column.get("name")).strip()
            col_type = _sql_type_for_source_column(str(column.get("type") or "text"))
            defs.append(f'"{col_name}" {col_type}')
            if bool(column.get("primary_key")):
                pk_cols.append(col_name)
        if pk_cols:
            pk_sql = ", ".join([f'"{name}"' for name in pk_cols])
            defs.append(f"PRIMARY KEY ({pk_sql})")

        db_conn.execute(f'CREATE TABLE IF NOT EXISTS "{table_id}" ({", ".join(defs)})')

        introspected = True
        try:
            persisted_columns = _existing_table_columns(db_conn, table_id, is_sqlite=is_sqlite)
        except Exception as exc:
            logger.warning(
                "[PIPELINE:DATA_TABLE] Could not inspect columns of table_id=%r: %s",
                table_id,
                exc,
            )
            persisted_columns = set()
            introspected = False

        # CREATE TABLE IF NOT EXISTS leaves an older table untouched, so add
        # every declared column it is still missing; otherwise the INSERT
        # below would reference columns that do not exist.
        for column in valid_columns:
            col_name = str(column.get("name")).strip()
            if col_name in persisted_columns:
                continue
            is_pooled = col_name in pooled_names
            if not introspected and not is_pooled:
                # Without a column listing only the scope columns are attempted.
                continue
            add_type = "TEXT" if is_pooled else _sql_type_for_source_column(str(column.get("type") or "text"))
            db_conn.execute(f'ALTER TABLE "{table_id}" ADD COLUMN "{col_name}" {add_type}')
            persisted_columns.add(col_name)

        column_names = [str(column.get("name")).strip() for column in valid_columns]
        declared_types = [str(column.get("type") or "") for column in valid_columns]
        sql = _build_source_table_upsert_sql(table_id, column_names, pk_cols, is_sqlite=is_sqlite)

        for normalized in normalized_records:
            # NOTE(review): a record without a ``payload`` attribute is treated
            # as an empty payload and still produces a row holding only the
            # scope columns - confirm this is intended.
            payload = normalized.payload if hasattr(normalized, "payload") else {}
            if not isinstance(payload, dict):
                continue
            row_values: List[Any] = []
            for col_name, declared_type in zip(column_names, declared_types):
                raw_value = payload.get(col_name)
                if raw_value is None:
                    if col_name in scope_fallbacks:
                        raw_value = scope_fallbacks[col_name]
                    elif col_name == "record_id":
                        raw_value = payload.get("id") or payload.get("message_id")
                row_values.append(_coerce_table_value(raw_value, declared_type=declared_type))
            db_conn.execute(sql, tuple(row_values))

    db_conn.commit()
466
+
467
+
468
async def _try_install_runtime_source_definition_from_control_plane(
    *,
    source_id: Optional[str],
    schema_id: str,
    user_id: Optional[str],
    dataset_id: str,
    progress_api_url: Optional[str],
    progress_api_key: Optional[str],
) -> Optional[Any]:
    """Best-effort source install when runtime registry is stale.

    Fetches ``{control-plane}/sources`` for the given user and dataset,
    installs the first listed definition that matches the requested
    ``source_id``/``schema_id`` (a blank filter matches everything) and
    returns what ``REGISTRY`` then holds for it. Returns ``None`` when the
    control-plane URL, the bearer token, ``user_id`` or ``dataset_id`` is
    missing, when nothing matching could be installed, or when any step
    raises (the failure is logged as a warning).
    """
    base_url = _control_plane_base_url(progress_api_url or settings.topos_control_plane_url)
    if not base_url:
        return None
    bearer = str(progress_api_key or settings.topos_key or "").strip()
    if not bearer:
        return None
    query = {
        "user_id": str(user_id or "").strip(),
        "dataset_id": str(dataset_id or "").strip(),
    }
    if not (query["user_id"] and query["dataset_id"]):
        return None

    try:
        import httpx
        from ..sources.runtime_install import install_source_definition

        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(
                f"{base_url}/sources",
                params=query,
                headers={"Authorization": f"Bearer {bearer}"},
            )
            resp.raise_for_status()
            body = resp.json() if resp.content else {}

        listed = body.get("sources") if isinstance(body, dict) else None
        if not isinstance(listed, list):
            return None

        wanted_source_id = str(source_id or "").strip()
        wanted_schema = str(schema_id or "").strip()
        for entry in listed:
            if not isinstance(entry, dict):
                continue
            row_source_id = str(entry.get("source_id") or "").strip()
            row_schema_id = str(entry.get("schema_id") or "").strip()
            source_matches = not wanted_source_id or row_source_id == wanted_source_id
            schema_matches = not wanted_schema or row_schema_id == wanted_schema
            if not (source_matches and schema_matches):
                continue
            install_source_definition(entry)
            installed = REGISTRY.get(row_source_id)
            if installed:
                logger.info(
                    "[PIPELINE:MANAGER] Installed runtime source definition from control-plane: source_id=%s schema_id=%s",
                    row_source_id,
                    row_schema_id,
                )
                return installed
    except Exception as exc:
        logger.warning(
            "[PIPELINE:MANAGER] Failed to install runtime source definition from control-plane (source_id=%s schema_id=%s): %s",
            source_id,
            schema_id,
            exc,
        )
    return None
533
+
534
+
535
+ @dataclass
536
+ class IngestionManager(BaseObject):
537
+ file_store: RawFileStore
538
+ checkpoint_store: Optional[CheckpointStore] = None
539
+
540
def __post_init__(self):
    """Initialize BaseObject after dataclass initialization."""
    # The dataclass-generated __init__ does not call BaseObject.__init__, so
    # a name is generated here when none has been set yet.
    if not hasattr(self, "_name"):
        from ..utils.base_object import _next_instance_number

        owner_cls = self.__class__
        sequence = _next_instance_number(owner_cls)
        object.__setattr__(self, "_name", f"{owner_cls.__name__}#{sequence}")
    # Call parent __init__ to ensure BaseObject is properly initialized
    BaseObject.__init__(self, name=getattr(self, "_name", None))
549
+
550
async def process_job(
    self,
    job: IngestionJob,
    source_id: Optional[str] = None,
    progress_api_url: Optional[str] = None,
    progress_api_key: Optional[str] = None,
) -> Dict[str, Any]:
    """Parse, persist, canonicalize and (optionally) enrich one raw ingestion file.

    Steps, in order: locate the raw file and parser for ``job.schema_id``,
    resolve a source definition, parse and validate every record, persist
    source data tables, canonicalize, run automatic enrichment, save a
    checkpoint, and emit a usage observation. Progress is POSTed to
    ``{progress_api_url}/v1/ingestion/progress`` when both ``progress_api_url``
    and ``progress_api_key`` are given; failures to report are only logged.

    Args:
        job: The ingestion job; ``dataset_id``, ``schema_id``, ``job_id`` and
            ``metadata`` are read.
        source_id: Preferred source definition id. When missing from the
            registry, the definition is looked up by ``job.schema_id``.
        progress_api_url: Base URL of the progress endpoint (optional).
        progress_api_key: Bearer token for the progress endpoint (optional).

    Returns:
        A dict with ``job_id``, ``records_processed``, ``errors_count``,
        ``errors`` (first 100 entries) and the progress fields
        ``progress_percent``, ``records_total``,
        ``estimated_seconds_remaining`` and ``current_step``.

    Raises:
        FileNotFoundError: The raw file for the job does not exist.
        ValueError: No parser is registered for ``job.schema_id``.
    """
    file_path = self.file_store.get_file_path(job.dataset_id, job.schema_id)
    if not file_path.exists():
        raise FileNotFoundError(f"Raw file not found: {file_path}")

    parser_cls = PARSER_REGISTRY.get(job.schema_id)
    if not parser_cls:
        raise ValueError(f"No parser registered for schema: {job.schema_id}")

    logger.debug(
        "[PIPELINE:MANAGER] %s: Starting job processing: job_id=%s, dataset_id=%s, schema_id=%s, source_id=%s, file_path=%s",
        self,
        job.job_id,
        job.dataset_id,
        job.schema_id,
        source_id,
        file_path,
    )
    # Instantiate parser with schema_id (for v2 support)
    parser = parser_cls(dataset_id=job.dataset_id, _schema_id=job.schema_id)

    # Resolve the file format once; it drives both the record count and parsing.
    # (Previously the count referenced an undefined name, so it never ran.)
    file_format = (job.metadata or {}).get("file_format", "jsonl")

    # Try to count total records for progress tracking (optional, may be None).
    # For JSONL the line count is only an approximation of the record count.
    records_total = None
    if file_format == "jsonl":
        try:
            with open(file_path, "rb") as handle:
                # An empty file yields 0, which is treated as "unknown".
                records_total = sum(1 for _ in handle) or None
        except OSError as exc:
            logger.debug(
                "[PIPELINE:MANAGER] %s: Could not count records in %s: %s",
                self,
                file_path,
                exc,
            )

    progress = IngestionProgress(job_id=job.job_id, records_total=records_total)
    progress_context = {
        "user_id": _owner_user_id_from_dataset_id(job.dataset_id),
        "dataset_id": job.dataset_id,
    }

    async def _post_progress(
        failure_message: str,
        *,
        include_progress: bool = True,
        overrides: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Best-effort POST of the current progress; never raises."""
        if not (progress_api_url and progress_api_key):
            return
        try:
            import httpx

            payload: Dict[str, Any] = {"job_id": job.job_id, **progress_context}
            if include_progress:
                payload.update(progress.to_dict())
            # Explicit overrides go last so they win over the tracker's values.
            if overrides:
                payload.update(overrides)
            async with httpx.AsyncClient(timeout=10.0) as client:
                await client.post(
                    f"{progress_api_url}/v1/ingestion/progress",
                    json=payload,
                    headers={"Authorization": f"Bearer {progress_api_key}"},
                )
        except Exception as exc:
            logger.warning(failure_message, exc)

    # Send initial progress update
    await _post_progress(
        "Failed to send initial ingestion progress: %s",
        include_progress=False,
        overrides={
            "status": "processing",
            "progress_percent": 0.0,
            "records_processed": 0,
            "records_total": records_total,
            "current_step": "starting",
        },
    )

    # Find source definition: use source_id if provided, otherwise find by schema_id
    source_def = None
    if source_id:
        source_def = REGISTRY.get(source_id)
        if source_def:
            logger.info(
                "[PIPELINE:MANAGER] %s: Using source from source_id=%s: %s (enrichment_trigger=%s)",
                self,
                source_id,
                source_def.display_name,
                getattr(source_def, "enrichment_trigger", "not_set"),
            )
        else:
            logger.warning(
                "[PIPELINE:MANAGER] %s: source_id=%s not found in registry, falling back to schema_id lookup",
                self,
                source_id,
            )

    if not source_def:
        # Fallback: find by schema_id (prefer file type for file ingestion)
        for source in REGISTRY.values():
            if source.schema_id == job.schema_id:
                # Prefer file type sources for file ingestion
                if source.source_type == "file":
                    source_def = source
                    logger.debug(
                        "[PIPELINE:MANAGER] %s: Found file source by schema_id: source_id=%s",
                        self,
                        source.source_id,
                    )
                    break
                elif not source_def:
                    # Keep first match as fallback
                    source_def = source
        if source_def:
            logger.info(
                "[PIPELINE:MANAGER] %s: Found source by schema_id: source_id=%s, source_type=%s, enrichment_trigger=%s",
                self,
                source_def.source_id,
                source_def.source_type,
                getattr(source_def, "enrichment_trigger", "not_set"),
            )
        else:
            # Last resort: ask the control plane for a runtime source definition.
            source_def = await _try_install_runtime_source_definition_from_control_plane(
                source_id=source_id,
                schema_id=job.schema_id,
                user_id=_owner_user_id_from_dataset_id(job.dataset_id),
                dataset_id=job.dataset_id,
                progress_api_url=progress_api_url,
                progress_api_key=progress_api_key,
            )

    # Get canonical mapper if available
    canonical_mapper = None
    if source_def and source_def.canonical_mapper_id:
        mapper_cls = MAPPER_REGISTRY.get(source_def.canonical_mapper_id)
        if mapper_cls:
            canonical_mapper = mapper_cls()

    # Initialize enrichment orchestrator with a real connection, even outside app startup.
    from ..core.state import get_db_connection

    db_conn = get_db_connection()
    tables_manager = DerivedTablesManager(conn=db_conn) if db_conn else None
    enrichment_orchestrator = EnrichmentOrchestrator(tables_manager=tables_manager) if tables_manager else None

    records_processed = 0
    errors: list[dict] = []
    last_record_id: Optional[str] = None
    normalized_records: List[Any] = []

    # Use TUI progress bar for better terminal display (single-line updates).
    # With an unknown total, start from a placeholder that is grown while parsing.
    pbar = ProgressBar(total=records_total or 1000, desc=f"{self}: Parsing")

    parsing_finished = False
    try:
        async for raw_payload in parse_file(_read_file_bytes(file_path), file_format):
            expanded_payloads = _expand_file_records(raw_payload, source_def)
            if not expanded_payloads:
                expanded_payloads = [raw_payload] if isinstance(raw_payload, dict) else []
            for record_payload in expanded_payloads:
                # First non-empty of "id" / "message_id", else a sequence number.
                # str() must be applied only after the None check: str(None) is
                # the truthy string "None", which used to defeat the fallbacks.
                record_id = f"{records_processed + 1}"
                for id_key in ("id", "message_id"):
                    candidate = record_payload.get(id_key)
                    if candidate is not None and str(candidate):
                        record_id = str(candidate)
                        break

                raw_content = record_payload.get("content")
                if isinstance(raw_content, str):
                    content_preview = raw_content[:100]
                else:
                    content_preview = str(raw_content)[:100]
                logger.debug(
                    "[PIPELINE:MANAGER] %s: Processing raw record: record_id=%s, content_preview=%s",
                    self,
                    record_id,
                    content_preview,
                )
                raw_record = RawRecord(record_id=record_id, payload=record_payload)
                validation = parser.validate(raw_record)
                if not validation.is_valid:
                    logger.debug(
                        "[PIPELINE:MANAGER] %s: Validation failed: record_id=%s, errors=%s",
                        self,
                        record_id,
                        validation.errors,
                    )
                    errors.append({"record_id": record_id, "errors": validation.errors})
                    if pbar:
                        pbar.update(1)  # Still count invalid records
                    continue
                normalized = parser.parse(raw_record)
                logger.debug(
                    "[PIPELINE:NORMALIZED] Record normalized: record_id=%s, fields=%s",
                    normalized.record_id,
                    sorted(normalized.payload.keys())[:12],
                )
                normalized_records.append(normalized)
                records_processed += 1
                last_record_id = record_id
                progress.update(records_processed, current_step="parsing")

                # Update progress bar
                if pbar:
                    # If we didn't know total initially, update it now
                    if not records_total and records_processed > pbar.total:
                        # Estimate: assume we're at least 10% done, so total is at least 10x current
                        pbar.total = max(pbar.total, records_processed * 10)
                    pbar.update(1)

                # Report progress to control plane if configured
                if progress.should_report() and progress_api_url and progress_api_key:
                    await _post_progress("Failed to send ingestion progress update: %s")

                if progress.should_report():
                    logger.debug("Ingestion progress: %s", progress.to_dict())
        parsing_finished = True
    finally:
        # On success the bar is reused and closed at the end of the function;
        # on failure or cancellation close it here so it is not left open.
        if not parsing_finished and pbar:
            pbar.close()

    # Update progress bar: parsing complete, move to canonicalization
    if pbar:
        # Update total if we now know it
        if not records_total and records_processed > 0:
            pbar.total = records_processed
            pbar.n = records_processed  # Set current to match
        pbar.set_description(f"{self}: Canonicalizing")
        pbar._display()  # Force display update

    # Update progress: parsing complete
    await _post_progress(
        "Failed to send parsing complete progress: %s",
        overrides={"current_step": "canonicalizing"},
    )

    # Persist parser output into source-defined logical tables when configured.
    if source_def and db_conn and normalized_records:
        try:
            _persist_source_data_tables(
                db_conn=db_conn,
                source_def=source_def,
                dataset_id=job.dataset_id,
                normalized_records=normalized_records,
            )
        except Exception as exc:
            logger.error(
                "[PIPELINE:DATA_TABLE] %s: Failed to persist source table rows: %s",
                self,
                exc,
                exc_info=True,
            )
            errors.append({"step": "source_data_table", "errors": [str(exc)]})

    # Canonicalize normalized records: conversations group -> conversation_messages; else engine ai_chat_*
    canonical_messages: List[Dict[str, Any]] = []
    if source_def and normalized_records:
        # Build staging records once (same shape for both paths)
        staging_records = []
        for normalized in normalized_records:
            payload = normalized.payload
            staging_record = {
                "message_id": payload.get("message_id"),
                "dataset_id": job.dataset_id,
                "thread_id": payload.get("thread_id") or payload.get("conversation_id") or job.dataset_id,
                "ts": payload.get("ts") or payload.get("created_at") or str(datetime.now(timezone.utc).timestamp()),
                "sender_type": payload.get("sender_type"),
                "content": payload.get("content"),
                "source_id": source_def.source_id,
            }
            if "_metadata" in payload:
                staging_record["_metadata"] = payload["_metadata"]
            staging_records.append(staging_record)

        canonical_group_id = getattr(source_def, "canonical_group_id", None)
        if canonical_group_id == "conversations":
            # Conversations canonical: write only to conversation_messages / conversations (never ai_chat_*)
            import json as _json

            from ..storage.canonical import ConversationsTablesManager

            db_conn = get_db_connection()
            if db_conn:
                conv_manager = ConversationsTablesManager(db_conn)
                canonical_result = conv_manager.upsert_message_batch(
                    staging_records, job.dataset_id, source_def.source_id
                )
                logger.debug(
                    "[PIPELINE:CANONICAL] %s: Conversations canonical: messages_created=%s, conversations_created=%s",
                    self,
                    canonical_result.get("messages_created", 0),
                    canonical_result.get("conversations_created", 0),
                )
                for staging_record in staging_records:
                    metadata_json = None
                    if "_metadata" in staging_record:
                        metadata_json = _json.dumps(staging_record["_metadata"])
                    canonical_messages.append({
                        "message_id": staging_record.get("message_id"),
                        "conversation_id": staging_record.get("thread_id") or staging_record.get("conversation_id") or job.dataset_id,
                        "sender_type": staging_record.get("sender_type"),
                        "sender_id": None,
                        "ts": staging_record.get("ts"),
                        "content": staging_record.get("content"),
                        "content_rendered": None,
                        "metadata_json": metadata_json,
                        "seq": 0,
                        "source_id": source_def.source_id,
                    })
        elif source_def.canonical_mapper_id:
            # Engine path: ai_chat_messages / ai_chat_conversations
            try:
                from ..storage.canonical.ai_chat import CanonicalTablesManager, Canonicalizer

                db_conn = get_db_connection()
                canonical_tables_manager = CanonicalTablesManager(db_conn) if db_conn else None
                if canonical_tables_manager:
                    canonicalizer = Canonicalizer(canonical_tables_manager)
                    mapper_source = source_def.canonical_mapper_id
                    logger.debug(
                        "[PIPELINE:CANONICAL] %s: Canonicalizing %d records with mapper=%s (source_id=%s)",
                        self,
                        len(staging_records),
                        mapper_source,
                        source_def.source_id,
                    )
                    canonical_result = canonicalizer.canonicalize_staging_batch(
                        staging_records, source=mapper_source, batch_size=1000
                    )
                    # Enrichment should consume canonicalized rows (not pre-mapper staging rows).
                    mapped_messages = canonical_result.get("canonical_messages")
                    if isinstance(mapped_messages, list):
                        canonical_messages.extend(
                            msg for msg in mapped_messages if isinstance(msg, dict)
                        )
                    logger.debug(
                        "[PIPELINE:CANONICAL] %s: Canonicalization complete: messages_created=%s, conversations_created=%s, canonical_messages_count=%s",
                        self,
                        canonical_result.get("messages_created", 0),
                        canonical_result.get("conversations_created", 0),
                        len(canonical_messages),
                    )
                else:
                    logger.warning("[PIPELINE:CANONICAL] %s: No database connection, skipping canonicalization", self)
            except ImportError as e:
                logger.warning("[PIPELINE:CANONICAL] %s: Canonicalization modules not available: %s. Using fallback mapper.", self, e)
                if canonical_mapper:
                    for normalized in normalized_records:
                        try:
                            canonical = canonical_mapper.map(normalized)
                            canonical.payload["source_id"] = source_def.source_id
                            canonical_messages.append(canonical.payload)
                        except Exception as exc:
                            logger.error("[PIPELINE:CANONICAL] %s: Failed to canonicalize record %s: %s", self, normalized.record_id, exc)
                            errors.append({"record_id": normalized.record_id, "errors": [str(exc)]})
            except Exception as exc:
                logger.error("[PIPELINE:CANONICAL] %s: Failed to canonicalize records: %s", self, exc, exc_info=True)
                errors.append({"step": "canonicalization", "errors": [str(exc)]})

    # Run enrichment on canonical messages (only if automatic trigger)
    if canonical_messages and source_def and source_def.canonical_enrichment_jobs:
        # Get enrichment trigger - explicitly check attribute, default to "automatic" if not set
        enrichment_trigger = getattr(source_def, "enrichment_trigger", "automatic")

        logger.info(
            "[PIPELINE:ENRICHMENT] %s: Enrichment trigger check: source_id=%s, enrichment_trigger=%s, canonical_messages=%d, jobs=%s",
            self,
            source_def.source_id,
            enrichment_trigger,
            len(canonical_messages),
            source_def.canonical_enrichment_jobs,
        )

        if enrichment_trigger == "manual":
            # Manual trigger: enrichment is deliberately not run here.
            logger.info(
                "[PIPELINE:ENRICHMENT] %s: ✅ SKIPPING enrichment (manual trigger): %d canonical messages will be enriched later via POST /v1/enrichment/process",
                self,
                len(canonical_messages),
            )
        elif enrichment_trigger == "automatic":
            # Only run enrichment if explicitly set to "automatic";
            # any other trigger value falls through without enrichment.
            logger.info(
                "[PIPELINE:ENRICHMENT] %s: Running enrichment (automatic trigger)",
                self,
            )
            # Filter out messages that are already enriched
            unenriched_messages = _filter_unenriched_messages(
                canonical_messages,
                source_def.canonical_enrichment_jobs,
                tables_manager,
                source_id=source_def.source_id,
                dataset_id=job.dataset_id,
            )

            if not unenriched_messages:
                logger.debug(
                    "[PIPELINE:ENRICHMENT] %s: All %d messages already enriched, skipping",
                    self,
                    len(canonical_messages),
                )
            elif not enrichment_orchestrator:
                logger.error(
                    "[PIPELINE:ENRICHMENT] %s: Cannot run enrichment - enrichment_orchestrator not initialized",
                    self,
                )
                errors.append({"step": "enrichment", "errors": ["Enrichment orchestrator not initialized"]})
            else:
                logger.info(
                    "[PIPELINE:ENRICHMENT] %s → %s: Starting enrichment (automatic): %d new messages (out of %d total), jobs=%s",
                    self,
                    enrichment_orchestrator,
                    len(unenriched_messages),
                    len(canonical_messages),
                    source_def.canonical_enrichment_jobs,
                )
                try:
                    enrichment_result = await enrichment_orchestrator.run_canonical(
                        unenriched_messages,
                        job_names=source_def.canonical_enrichment_jobs,
                    )
                    logger.info(
                        "[PIPELINE:ENRICHMENT] %s → %s: Enrichment complete: jobs_run=%s, records_created=%s, errors=%s",
                        self,
                        enrichment_orchestrator,
                        enrichment_result.get("jobs_run"),
                        enrichment_result.get("records_created"),
                        len(enrichment_result.get("errors", [])),
                    )
                    if enrichment_result.get("errors"):
                        errors.extend(enrichment_result["errors"])
                except Exception as exc:
                    logger.error(
                        "[PIPELINE:ENRICHMENT] %s → %s: Enrichment failed: %s",
                        self,
                        enrichment_orchestrator,
                        exc,
                        exc_info=True,
                    )
                    errors.append({"step": "enrichment", "errors": [str(exc)]})

    # Record how far this run got, keyed by the last successfully parsed record.
    if self.checkpoint_store and last_record_id:
        checkpoint = IngestionCheckpoint(
            dataset_id=job.dataset_id,
            schema_id=job.schema_id,
            last_record_id=last_record_id,
            metadata={"file_path": str(file_path)},
        )
        self.checkpoint_store.save_checkpoint(checkpoint)

    logger.debug(
        "[PIPELINE:MANAGER] %s: Job complete: job_id=%s, records_processed=%s, errors_count=%s, last_record_id=%s",
        self,
        job.job_id,
        records_processed,
        len(errors),
        last_record_id,
    )

    # Update progress: set records_total if we now know it
    if not progress.records_total and records_processed > 0:
        progress.records_total = records_processed

    # Finalize progress bar
    if pbar:
        # Ensure progress bar shows 100%
        if records_processed > 0:
            pbar.total = records_processed
            pbar.n = records_processed
        pbar.set_description(f"{self}: Complete")
        pbar._display()
        pbar.close()

    # Send final progress update (explicit completion values win over the tracker's)
    await _post_progress(
        "Failed to send final ingestion progress: %s",
        overrides={
            "status": "completed",
            "current_step": "completed",
            "progress_percent": 100.0,  # Ensure 100% on completion
        },
    )

    # Usage is metered in whole megabytes, rounded up; an unreadable size counts as 0.
    try:
        file_size_bytes = int(file_path.stat().st_size)
    except OSError:
        file_size_bytes = 0
    quantity_mb = int((max(0, file_size_bytes) + (1024 * 1024) - 1) // (1024 * 1024))
    await emit_usage_observation(
        action="ingestion.file_processed",
        quantity=quantity_mb,
        producer="ingestion.manager",
        canonical_action_identity={
            "job_id": job.job_id,
            "dataset_id": job.dataset_id,
            "schema_id": job.schema_id,
            "source_id": source_id or "",
            "records_processed": records_processed,
        },
        topos_id=job.dataset_id,
        trust_class="cp_observed_self_hosted",
        metadata={"file_size_bytes": file_size_bytes, "quantity_mb": quantity_mb},
    )

    # Include progress information in return (for progress bar)
    progress_dict = progress.to_dict()

    return {
        "job_id": job.job_id,
        "records_processed": records_processed,
        "errors_count": len(errors),
        "errors": errors[:100],
        # Include progress for progress bar
        "progress_percent": progress_dict.get("progress_percent", 0.0),
        "records_total": progress_dict.get("records_total"),
        "estimated_seconds_remaining": progress_dict.get("estimated_seconds_remaining"),
        "current_step": progress_dict.get("current_step"),
    }