topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,959 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from fastapi import APIRouter, Body, Depends, HTTPException
8
+
9
+ from ..auth import require_api_key
10
+ from ..enrichment.derived_tables import DerivedTablesManager
11
+ from ..enrichment.jobs import CANONICAL_JOBS
12
+ from ..enrichment.orchestrator import EnrichmentOrchestrator
13
+ from ..sources.registry import REGISTRY
14
+ from ..core.state import get_db_connection
15
+ # Removed imports: canonicalization.mappers, ingestion.parsers, storage.raw.file_store, analytics.raw_queries
16
+ # Enrichment now reads directly from canonical table (ai_chat_messages) per architecture design
17
+
18
+ logger = logging.getLogger("topos.api.enrichment")
19
+
20
+ router = APIRouter()
21
+
22
+
23
+ def _url_classification_test_schema() -> Dict[str, Any]:
24
+ return {
25
+ "type": "object",
26
+ "required": ["url"],
27
+ "properties": {
28
+ "url": {
29
+ "type": "string",
30
+ "title": "URL",
31
+ "description": "Website URL to classify",
32
+ "example": "https://www.nytimes.com",
33
+ },
34
+ "title": {
35
+ "type": "string",
36
+ "title": "Page Title",
37
+ "description": "Optional page title for better classification context",
38
+ "example": "The New York Times - Breaking News",
39
+ },
40
+ },
41
+ }
42
+
43
+
44
async def _test_browser_visits_url_classification(*, data_packet: Dict[str, Any]) -> Dict[str, Any]:
    """Run a single URL classification through the engine for a test request.

    Args:
        data_packet: Mapping with a non-empty string ``url`` and, optionally,
            a string ``title``.

    Returns:
        ``{"status": "ok", "input": {...}, "output": ...}`` where ``input``
        echoes the ``url``/``title`` exactly as received and ``output`` is the
        engine result's ``output``.

    Raises:
        HTTPException: 400 when ``url`` or ``title`` fails validation; 502
            when the engine result status is not ``"completed"``.
    """
    from ..engine import Engine, build_url_classification_task

    raw_url = data_packet.get("url")
    raw_title = data_packet.get("title")

    url_is_usable = isinstance(raw_url, str) and bool(raw_url.strip())
    if not url_is_usable:
        raise HTTPException(status_code=400, detail="data_packet.url must be a non-empty string")
    if not (raw_title is None or isinstance(raw_title, str)):
        raise HTTPException(status_code=400, detail="data_packet.title must be a string when provided")

    # The task receives the stripped URL; the response echoes the raw input.
    classification_task = build_url_classification_task(
        task_id="test_url_cls",
        url=raw_url.strip(),
        title=raw_title,
    )
    runner = Engine()
    # engine.run is executed via asyncio.to_thread rather than called inline.
    outcome = await asyncio.to_thread(runner.run, classification_task)

    if outcome.status == "completed":
        return {
            "status": "ok",
            "input": {"url": raw_url, "title": raw_title},
            "output": outcome.output,
        }

    raise HTTPException(
        status_code=502,
        detail=outcome.error or f"Engine returned status {outcome.status}",
    )
71
+
72
+
73
# Test handlers for raw-source enrichments, keyed by a (source, enrichment) pair.
_RAW_SOURCE_TEST_HANDLERS = {("browser_visits", "url_classification"): _test_browser_visits_url_classification}

# Input schemas for the same (source, enrichment) pairs; the schema dict is
# built once, when this module is imported.
_RAW_SOURCE_TEST_SCHEMAS = {("browser_visits", "url_classification"): _url_classification_test_schema()}
80
+
81
+
82
async def _backfill_browser_visits_url_classification(
    *,
    db_conn,
    only_missing: bool = True,
    limit: Optional[int] = None,
) -> Dict[str, Any]:
    """Classify URLs of raw browser-visit rows and persist each classification.

    Args:
        db_conn: Database connection; queried through ``execute(...)`` and
            handed to the raw-table and classification-table helpers.
        only_missing: When true, select only visits that have no matching
            ``browser_url_classification`` row; otherwise select every visit.
        limit: When a positive int, caps the number of selected rows.

    Returns:
        Summary dict with ``rows_scanned``, ``rows_processed``,
        ``rows_skipped``, ``rows_failed`` and ``errors`` (first 100 entries).
        Per-row failures are counted and recorded, not raised.
    """
    from ..engine import Engine, build_url_classification_task
    from ..storage.raw.browser_flat_tables import (
        ensure_browser_url_classification_table,
        write_browser_url_classification,
    )
    from ..enrichment.progress_bar import ProgressBar
    from ..storage.raw.raw_tables_manager import RawTablesManager

    source_table = "raw_chat_messages_browservisits"

    logger.info(
        "[PIPELINE:ENRICHMENT] Source backfill start: source=browser_visits enrichment=url_classification only_missing=%s limit=%s",
        only_missing,
        limit,
    )

    # Ensure/migrate the raw browser visits table before reading from it.
    RawTablesManager(db_conn).ensure_raw_table(source_table)

    # A still-missing source table is reported as an empty, successful run.
    table_row = db_conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (source_table,),
    ).fetchone()
    if not table_row:
        logger.info(
            "[PIPELINE:ENRICHMENT] Source backfill complete: source table missing (%s)",
            source_table,
        )
        return {
            "rows_scanned": 0,
            "rows_processed": 0,
            "rows_skipped": 0,
            "rows_failed": 0,
            "errors": [],
        }

    ensure_browser_url_classification_table(db_conn)

    # The record id is derived as "<url>_<visited_at>" in both query variants.
    if not only_missing:
        query = """
            SELECT
                (COALESCE(v.url, '') || '_' || COALESCE(v.visited_at, '')) AS derived_record_id,
                v.dataset_id,
                v.url,
                v.title
            FROM raw_chat_messages_browservisits v
            ORDER BY v.visited_at ASC
        """
    else:
        query = """
            SELECT
                (COALESCE(v.url, '') || '_' || COALESCE(v.visited_at, '')) AS derived_record_id,
                v.dataset_id,
                v.url,
                v.title
            FROM raw_chat_messages_browservisits v
            LEFT JOIN browser_url_classification c
                ON c.source_table = 'browser_visits'
                AND c.record_id = (COALESCE(v.url, '') || '_' || COALESCE(v.visited_at, ''))
            WHERE c.record_id IS NULL
            ORDER BY v.visited_at ASC
        """

    bind_values: List[Any] = []
    if isinstance(limit, int) and limit > 0:
        query += " LIMIT ?"
        bind_values.append(limit)

    visit_rows = db_conn.execute(query, tuple(bind_values)).fetchall()

    done_count = 0
    skip_count = 0
    fail_count = 0
    failures: List[Dict[str, Any]] = []

    if visit_rows:
        with ProgressBar(total=len(visit_rows), desc="url_classification backfill") as bar:
            for visit in visit_rows:
                record_id, dataset_id, url, title = visit[0], visit[1], visit[2], visit[3]

                # Rows without a usable URL are skipped, not failed.
                if not (isinstance(url, str) and url.strip()):
                    skip_count += 1
                    bar.update(1)
                    continue

                try:
                    task = build_url_classification_task(
                        task_id=f"backfill_url_{record_id}",
                        url=url,
                        title=title,
                        source_id="browser_visits",
                        record_ids=[record_id],
                    )
                    engine = Engine()
                    outcome = await asyncio.to_thread(engine.run, task)
                    if outcome.status == "completed":
                        classified = outcome.output
                        write_browser_url_classification(
                            db_conn,
                            source_table="browser_visits",
                            record_id=record_id,
                            dataset_id=dataset_id,
                            url=url,
                            title=title,
                            category=classified.get("category"),
                            confidence=classified.get("confidence"),
                            model_name=classified.get("model"),
                            ensure_table=False,
                            log_write=False,  # Avoid per-row log spam during bulk backfill
                        )
                        done_count += 1
                    else:
                        fail_count += 1
                        failures.append({"record_id": record_id, "error": outcome.error or outcome.status})
                except Exception as exc:  # noqa: BLE001
                    fail_count += 1
                    failures.append({"record_id": record_id, "error": str(exc)})
                finally:
                    # Runs for success, engine failure and exception alike.
                    bar.update(1)

    summary = {
        "rows_scanned": len(visit_rows),
        "rows_processed": done_count,
        "rows_skipped": skip_count,
        "rows_failed": fail_count,
        "errors": failures[:100],
    }
    logger.info(
        "[PIPELINE:ENRICHMENT] Source backfill complete: source=browser_visits enrichment=url_classification scanned=%d processed=%d skipped=%d failed=%d",
        summary["rows_scanned"],
        summary["rows_processed"],
        summary["rows_skipped"],
        summary["rows_failed"],
    )
    return summary
226
+
227
+
228
# Backfill coroutines for raw-source enrichments, keyed by a (source, enrichment) pair.
_RAW_SOURCE_BACKFILL_HANDLERS = {("browser_visits", "url_classification"): _backfill_browser_visits_url_classification}
231
+
232
+
233
+ def _get_enriched_message_ids(table_name: str, conn) -> set[str]:
234
+ """Get set of message_ids that have enrichment records in the given table."""
235
+ if not conn:
236
+ return set()
237
+ try:
238
+ cursor = conn.execute(f"SELECT DISTINCT message_id FROM {table_name}")
239
+ return {row[0] for row in cursor.fetchall()}
240
+ except Exception as e:
241
+ logger.warning("Failed to query enriched message IDs from %s: %s", table_name, e)
242
+ return set()
243
+
244
+
245
async def _find_unprocessed_messages(
    source_id: str,
    dataset_id: Optional[str] = None,
    job_names: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """Find canonical messages that haven't been enriched yet.

    Messages are read directly from the ``ai_chat_messages`` table (the
    canonical table), then filtered against the derived tables of the
    enrichment jobs being checked.

    Args:
        source_id: Source identifier; must be present in ``REGISTRY``.
        dataset_id: Optional dataset ID. The part before the first ``:`` (or
            the whole value when there is no ``:``) is matched against
            ``ai_chat_conversations.owner_user_id``. If that filter matches
            nothing, all messages for ``source_id`` are used instead.
        job_names: Enrichment job names to check; defaults to the source's
            ``canonical_enrichment_jobs``.

    Returns:
        Message dicts (message_id, conversation_id, sender_type, sender_id,
        event_at, content, content_rendered, metadata_json, sequence,
        source_id) ordered by ``event_at``. An empty list is returned when the
        source has no canonical enrichment jobs, no database connection is
        available, the canonical table is missing, or reading it fails.

    Raises:
        ValueError: If ``source_id`` is not registered.
    """
    source_def = REGISTRY.get(source_id)
    if not source_def:
        raise ValueError(f"Source {source_id} not found")

    if not source_def.canonical_enrichment_jobs:
        return []

    # Default to all canonical enrichment jobs of the source.
    jobs_to_check = job_names or source_def.canonical_enrichment_jobs

    db_conn = get_db_connection()
    if not db_conn:
        logger.warning("No database connection available for enrichment")
        return []

    # Column order of both SELECTs below; also the keys of the returned dicts.
    message_columns = (
        "message_id",
        "conversation_id",
        "sender_type",
        "sender_id",
        "event_at",
        "content",
        "content_rendered",
        "metadata_json",
        "sequence",
        "source_id",
    )
    source_only_query = """
        SELECT message_id, conversation_id, sender_type, sender_id,
               event_at, content, content_rendered, metadata_json, sequence, source_id
        FROM ai_chat_messages
        WHERE source_id = ?
        ORDER BY event_at ASC
    """
    user_filtered_query = """
        SELECT m.message_id, m.conversation_id, m.sender_type, m.sender_id,
               m.event_at, m.content, m.content_rendered, m.metadata_json, m.sequence, m.source_id
        FROM ai_chat_messages m
        INNER JOIN ai_chat_conversations c ON m.conversation_id = c.conversation_id
        WHERE m.source_id = ? AND c.owner_user_id = ?
        ORDER BY m.event_at ASC
    """
    # Diagnostic queries are only worth running when their output is emitted.
    debug_enabled = logger.isEnabledFor(logging.DEBUG)

    def table_exists(name: str) -> bool:
        """Return True when a table called ``name`` is listed in sqlite_master."""
        found = db_conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (name,),
        ).fetchone()
        return found is not None

    try:
        if not table_exists("ai_chat_messages"):
            logger.info(
                "ai_chat_messages table does not exist yet. "
                "Wait for ingestion to complete (job status 'completed') before triggering enrichment."
            )
            return []

        # The conversations table is only needed for dataset_id filtering.
        has_conversations_table = table_exists("ai_chat_conversations")

        # This count also decides whether the user-filtered path is attempted.
        total_msgs = db_conn.execute(
            "SELECT COUNT(*) FROM ai_chat_messages WHERE source_id = ?",
            (source_id,),
        ).fetchone()[0]
        logger.debug("Debug: Total messages in ai_chat_messages for source_id=%s: %d", source_id, total_msgs)

        if debug_enabled:
            all_sources = [
                (row[0], row[1])
                for row in db_conn.execute(
                    "SELECT DISTINCT source_id, COUNT(*) as count FROM ai_chat_messages GROUP BY source_id"
                ).fetchall()
            ]
            logger.debug("Debug: All source_ids in ai_chat_messages table: %s", all_sources if all_sources else "none")

            total_all = db_conn.execute("SELECT COUNT(*) FROM ai_chat_messages").fetchone()[0]
            logger.debug("Debug: Total messages in ai_chat_messages (all sources): %d", total_all)

        if has_conversations_table and dataset_id and total_msgs > 0:
            # Text before the first ":" (or the whole id when there is none).
            user_id = dataset_id.partition(":")[0]
            logger.debug(
                "Querying canonical messages: source_id=%s, dataset_id=%s, extracted_user_id=%s",
                source_id,
                dataset_id,
                user_id,
            )

            if debug_enabled:
                owner_rows = db_conn.execute(
                    """
                    SELECT DISTINCT c.owner_user_id, COUNT(*) as msg_count
                    FROM ai_chat_messages m
                    INNER JOIN ai_chat_conversations c ON m.conversation_id = c.conversation_id
                    WHERE m.source_id = ?
                    GROUP BY c.owner_user_id
                    """,
                    (source_id,),
                ).fetchall()
                logger.debug(
                    "Debug: Found conversations with owner_user_ids: %s",
                    [(row[0], row[1]) for row in owner_rows] if owner_rows else "none",
                )

                # Only a five-row sample is logged, so only five rows are fetched.
                conv_ids = [
                    row[0]
                    for row in db_conn.execute(
                        "SELECT DISTINCT conversation_id FROM ai_chat_messages WHERE source_id = ? LIMIT 5",
                        (source_id,),
                    ).fetchall()
                ]
                logger.debug("Debug: Conversation IDs in messages: %s", conv_ids if conv_ids else "none")

                all_convs = [
                    (row[0], row[1])
                    for row in db_conn.execute(
                        "SELECT conversation_id, owner_user_id FROM ai_chat_conversations LIMIT 5"
                    ).fetchall()
                ]
                logger.debug("Debug: All conversations in table: %s", all_convs if all_convs else "none")

            # Fetch once and reuse the rows; no counting pass, no second execution.
            rows = db_conn.execute(user_filtered_query, (source_id, user_id)).fetchall()
            logger.debug("Debug: Query with user_id filter returned %d messages", len(rows))

            if not rows:
                # Fall back to source_id only (for local mode).
                logger.debug("Debug: No messages found with user_id filter, falling back to source_id only")
                rows = db_conn.execute(source_only_query, (source_id,)).fetchall()
        else:
            # No conversations table, no dataset_id, or no messages for this source.
            logger.debug(
                "Querying canonical messages without user filter: source_id=%s, has_conversations_table=%s, dataset_id=%s",
                source_id,
                has_conversations_table,
                dataset_id,
            )
            rows = db_conn.execute(source_only_query, (source_id,)).fetchall()

        canonical_messages: List[Dict[str, Any]] = [
            {column: row[index] for index, column in enumerate(message_columns)}
            for row in rows
        ]

        logger.debug(
            "Found %d canonical messages for source_id=%s, dataset_id=%s",
            len(canonical_messages),
            source_id,
            dataset_id,
        )

    except Exception as e:
        # Best-effort read: log with traceback and report "nothing to enrich".
        logger.exception("Failed to read canonical messages from ai_chat_messages table: %s", e)
        return []

    if not canonical_messages:
        logger.debug("No canonical messages found for source_id=%s, dataset_id=%s", source_id, dataset_id)
        return []

    # Map each registered job name to the derived table holding its results.
    job_to_table = {job.get_job_name(): job.get_derived_table() for job in CANONICAL_JOBS}

    # NOTE(review): IDs are unioned across jobs, so a message enriched by any
    # one of the checked jobs is treated as processed for all of them.
    # Confirm this is the intended meaning of "unprocessed".
    enriched_ids: set[str] = set()
    for job_name in jobs_to_check:
        table_name = job_to_table.get(job_name)
        if table_name:
            enriched_ids.update(_get_enriched_message_ids(table_name, db_conn))
        else:
            logger.warning("Unknown enrichment job: %s (skipping check)", job_name)

    unprocessed = [
        msg for msg in canonical_messages
        if msg.get("message_id") not in enriched_ids
    ]

    logger.debug(
        "Found %d unprocessed messages out of %d total canonical messages for source_id=%s",
        len(unprocessed),
        len(canonical_messages),
        source_id,
    )

    return unprocessed
462
+
463
+
464
+ async def _process_enrichment_core(
465
+ source_id: str,
466
+ dataset_id: Optional[str] = None,
467
+ job_names: Optional[List[str]] = None,
468
+ force_reprocess: bool = False,
469
+ ) -> Dict[str, Any]:
470
+ """Core logic for processing enrichment (reusable from HTTP and WebSocket).
471
+
472
+ Args:
473
+ source_id: Source identifier
474
+ dataset_id: Optional dataset ID to filter by
475
+ job_names: Optional list of specific enrichment jobs to run
476
+ force_reprocess: If True, reprocess even if already enriched
477
+
478
+ Returns:
479
+ Processing results
480
+ """
481
+ # Get source definition
482
+ source_def = REGISTRY.get(source_id)
483
+ if not source_def:
484
+ raise ValueError(f"Source {source_id} not found")
485
+
486
+ # Determine which jobs to run
487
+ jobs_to_run = job_names or source_def.canonical_enrichment_jobs
488
+ if not jobs_to_run:
489
+ return {
490
+ "status": "ok",
491
+ "message": "No enrichment jobs configured for this source",
492
+ "messages_processed": 0,
493
+ "records_created": {},
494
+ }
495
+
496
+ # Get database connection
497
+ db_conn = get_db_connection()
498
+ if not db_conn:
499
+ return {
500
+ "status": "error",
501
+ "message": "Database connection not available",
502
+ "messages_processed": 0,
503
+ "records_created": {},
504
+ }
505
+
506
+ # Find unprocessed messages
507
+ if force_reprocess:
508
+ # For force reprocess, load all canonical messages regardless of enrichment status
509
+ # Read directly from canonical table (source of truth)
510
+ try:
511
+ # Check if ai_chat_messages table exists
512
+ cursor = db_conn.execute("""
513
+ SELECT name FROM sqlite_master
514
+ WHERE type='table' AND name='ai_chat_messages'
515
+ """)
516
+ if not cursor.fetchone():
517
+ return {
518
+ "status": "ok",
519
+ "message": "No canonical messages found",
520
+ "messages_processed": 0,
521
+ "records_created": {},
522
+ }
523
+
524
+ # Check if ai_chat_conversations table exists for dataset_id filtering
525
+ cursor = db_conn.execute("""
526
+ SELECT name FROM sqlite_master
527
+ WHERE type='table' AND name='ai_chat_conversations'
528
+ """)
529
+ has_conversations_table = cursor.fetchone() is not None
530
+
531
+ # Build query to read all canonical messages
532
+ if has_conversations_table and dataset_id:
533
+ user_id = dataset_id.split(":")[0] if ":" in dataset_id else dataset_id
534
+ # Use INNER JOIN to ensure we only get messages with matching conversations
535
+ query = """
536
+ SELECT m.message_id, m.conversation_id, m.sender_type, m.sender_id,
537
+ m.event_at, m.content, m.content_rendered, m.metadata_json, m.sequence, m.source_id
538
+ FROM ai_chat_messages m
539
+ INNER JOIN ai_chat_conversations c ON m.conversation_id = c.conversation_id
540
+ WHERE m.source_id = ? AND c.owner_user_id = ?
541
+ ORDER BY m.event_at ASC
542
+ """
543
+ cursor = db_conn.execute(query, (source_id, user_id))
544
+ else:
545
+ query = """
546
+ SELECT message_id, conversation_id, sender_type, sender_id,
547
+ event_at, content, content_rendered, metadata_json, sequence, source_id
548
+ FROM ai_chat_messages
549
+ WHERE source_id = ?
550
+ ORDER BY event_at ASC
551
+ """
552
+ cursor = db_conn.execute(query, (source_id,))
553
+
554
+ # Convert rows to dictionaries
555
+ unprocessed_messages = []
556
+ for row in cursor.fetchall():
557
+ unprocessed_messages.append({
558
+ "message_id": row[0],
559
+ "conversation_id": row[1],
560
+ "sender_type": row[2],
561
+ "sender_id": row[3],
562
+ "event_at": row[4],
563
+ "content": row[5],
564
+ "content_rendered": row[6],
565
+ "metadata_json": row[7],
566
+ "sequence": row[8],
567
+ "source_id": row[9],
568
+ })
569
+ except Exception as e:
570
+ logger.error("Failed to read canonical messages for force_reprocess: %s", e)
571
+ return {
572
+ "status": "error",
573
+ "message": f"Failed to read canonical messages: {e}",
574
+ "messages_processed": 0,
575
+ "records_created": {},
576
+ }
577
+ else:
578
+ unprocessed_messages = await _find_unprocessed_messages(source_id, dataset_id, jobs_to_run)
579
+
580
+ if not unprocessed_messages:
581
+ return {
582
+ "status": "ok",
583
+ "message": "No unprocessed messages found",
584
+ "messages_processed": 0,
585
+ "records_created": {},
586
+ }
587
+
588
+ # Run enrichment
589
+ tables_manager = DerivedTablesManager(conn=db_conn)
590
+ orchestrator = EnrichmentOrchestrator(tables_manager=tables_manager)
591
+
592
+ logger.info(
593
+ "[PIPELINE:ENRICHMENT] %s: Manual enrichment triggered: source_id=%s, messages=%d, jobs=%s",
594
+ orchestrator,
595
+ source_id,
596
+ len(unprocessed_messages),
597
+ jobs_to_run,
598
+ )
599
+
600
+ # Define progress callback to update progress during execution
601
+ progress_callback = None
602
+ try:
603
+ from ..enrichment.progress import get_progress
604
+ # Try to get progress object if it exists (created by handler)
605
+ progress_obj = get_progress(source_id) # Use source_id as fallback lookup
606
+ if progress_obj:
607
+ def progress_callback(
608
+ processed_count: int,
609
+ total_count: int,
610
+ job_name: str,
611
+ job_percent: float,
612
+ current_job_progress: float,
613
+ ):
614
+ """Update progress as jobs execute."""
615
+ estimated_messages_processed = int((job_percent / 100) * total_count)
616
+ jobs_complete = int((job_percent / 100) * len(jobs_to_run))
617
+ progress_obj.update(
618
+ messages_processed=estimated_messages_processed,
619
+ messages_skipped=0,
620
+ current_job_name=job_name,
621
+ current_job_progress_percent=current_job_progress,
622
+ jobs_complete=jobs_complete,
623
+ jobs_total=len(jobs_to_run),
624
+ )
625
+ except Exception:
626
+ pass # Progress callback is optional
627
+
628
+ enrichment_result = await orchestrator.run_canonical(
629
+ unprocessed_messages,
630
+ job_names=jobs_to_run,
631
+ progress_callback=progress_callback,
632
+ )
633
+
634
+ return {
635
+ "status": "ok",
636
+ "source_id": source_id,
637
+ "messages_processed": len(unprocessed_messages),
638
+ "jobs_run": enrichment_result.get("jobs_run", 0),
639
+ "records_created": enrichment_result.get("records_created", {}),
640
+ "errors": enrichment_result.get("errors", []),
641
+ }
642
+
643
+
644
async def _get_enrichment_status_core(
    source_id: str,
    dataset_id: Optional[str] = None,
) -> Dict[str, Any]:
    """Summarise enrichment progress for one source (reusable from HTTP and WebSocket).

    The total is counted directly from the canonical ``ai_chat_messages``
    table; the unprocessed count is the length of whatever
    ``_find_unprocessed_messages`` returns for the same arguments.

    Args:
        source_id: Key looked up in ``REGISTRY``.
        dataset_id: Optional dataset identifier. When it is truthy and the
            ``ai_chat_conversations`` table exists, the text before the first
            ``":"`` is compared with ``owner_user_id`` to narrow the count.

    Returns:
        A dict holding ``status``, ``source_id``, ``total_messages``,
        ``processed_messages``, ``unprocessed_messages``, ``enrichment_jobs``
        and ``enrichment_trigger``. Every path except the fully successful
        one also carries an explanatory ``message``.

    Raises:
        ValueError: If ``REGISTRY`` has no (truthy) entry for ``source_id``.
    """
    source_def = REGISTRY.get(source_id)
    if not source_def:
        raise ValueError(f"Source {source_id} not found")

    def _report(status, total=0, pending=0, note=None):
        # Evaluated lazily on purpose: attribute lookups on source_def happen
        # at the moment a response is produced, not up front.
        report = {
            "status": status,
            "source_id": source_id,
            "total_messages": total,
            "processed_messages": total - pending,
            "unprocessed_messages": pending,
            "enrichment_jobs": source_def.canonical_enrichment_jobs,
            "enrichment_trigger": getattr(source_def, "enrichment_trigger", "automatic"),
        }
        if note is not None:
            report["message"] = note
        return report

    db_conn = get_db_connection()
    if not db_conn:
        return _report("error", note="Database connection not available")

    try:
        # Without the canonical table there is nothing to count yet.
        messages_probe = db_conn.execute("""
            SELECT name FROM sqlite_master
            WHERE type='table' AND name='ai_chat_messages'
        """)
        if not messages_probe.fetchone():
            return _report("ok", note="Canonical table does not exist yet")

        # Owner filtering is only possible when the conversations table exists.
        conversations_probe = db_conn.execute("""
            SELECT name FROM sqlite_master
            WHERE type='table' AND name='ai_chat_conversations'
        """)
        can_filter_by_owner = conversations_probe.fetchone() is not None

        if can_filter_by_owner and dataset_id:
            # Text before the first ":" (the whole value when there is none).
            owner_id = dataset_id.partition(":")[0]
            # The WHERE clause on c.owner_user_id drops rows with no matching
            # conversation, so this LEFT JOIN only keeps matched messages.
            count_sql = """
                SELECT COUNT(*)
                FROM ai_chat_messages m
                LEFT JOIN ai_chat_conversations c ON m.conversation_id = c.conversation_id
                WHERE m.source_id = ? AND c.owner_user_id = ?
            """
            count_args = (source_id, owner_id)
        else:
            count_sql = "SELECT COUNT(*) FROM ai_chat_messages WHERE source_id = ?"
            count_args = (source_id,)

        total = db_conn.execute(count_sql, count_args).fetchone()[0]

    except Exception as e:
        logger.error("Failed to read canonical messages from ai_chat_messages table: %s", e)
        return _report("error", note=f"Error reading canonical table: {e}")

    # processed = total - unprocessed, derived inside _report.
    pending = len(await _find_unprocessed_messages(source_id, dataset_id))
    return _report("ok", total=total, pending=pending)
745
+
746
+
747
@router.post("/enrichment/process", dependencies=[Depends(require_api_key)])
async def process_enrichment(
    source_id: str = Body(...),
    dataset_id: Optional[str] = Body(None),
    job_names: Optional[List[str]] = Body(None),
    force_reprocess: bool = Body(False),
) -> Dict[str, Any]:
    """Manually trigger enrichment for unprocessed messages.

    Thin HTTP wrapper around ``_process_enrichment_core``; all arguments are
    forwarded unchanged.

    Args:
        source_id: Source identifier
        dataset_id: Optional dataset ID to filter by
        job_names: Optional list of specific enrichment jobs to run
        force_reprocess: If True, reprocess even if already enriched

    Returns:
        Processing results, exactly as returned by the core function.

    Raises:
        HTTPException: 404 when the core raises ``ValueError`` (presumably an
            unknown source -- confirm against ``_process_enrichment_core``),
            500 for any other unexpected failure. An ``HTTPException`` raised
            by the core is propagated untouched.
    """
    try:
        return await _process_enrichment_core(
            source_id=source_id,
            dataset_id=dataset_id,
            job_names=job_names,
            force_reprocess=force_reprocess,
        )
    except HTTPException:
        # Already carries a status code; keep it out of the generic 500
        # handler below (same convention as the backfill/test endpoints).
        raise
    except ValueError as e:
        raise HTTPException(status_code=404, detail=str(e)) from e
    except Exception as e:
        logger.error("Manual enrichment failed: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
777
+
778
+
779
@router.get("/enrichment/status", dependencies=[Depends(require_api_key)])
async def get_processing_status(
    source_id: str,
    dataset_id: Optional[str] = None,
) -> Dict[str, Any]:
    """Get enrichment status for a source.

    Thin HTTP wrapper around ``_get_enrichment_status_core``.

    Args:
        source_id: Source identifier to report on.
        dataset_id: Optional dataset ID forwarded to the core function.

    Returns:
        Status information including counts of processed/unprocessed messages

    Raises:
        HTTPException: 404 when the core raises ``ValueError``, 500 for any
            other unexpected failure. An ``HTTPException`` raised by the core
            is propagated untouched.
    """
    try:
        return await _get_enrichment_status_core(
            source_id=source_id,
            dataset_id=dataset_id,
        )
    except HTTPException:
        # Already carries a status code; keep it out of the generic 500
        # handler below (same convention as the backfill/test endpoints).
        raise
    except ValueError as e:
        raise HTTPException(status_code=404, detail=str(e)) from e
    except Exception as e:
        logger.error("Failed to get enrichment status: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
799
+
800
+
801
@router.get(
    "/sources/{source_id}/enrichments",
    dependencies=[Depends(require_api_key)],
)
async def list_source_enrichments(source_id: str) -> Dict[str, Any]:
    """Describe which enrichments a source declares and which have handlers.

    Responds with 404 when ``source_id`` has no (truthy) entry in ``REGISTRY``.
    For each declared raw enrichment, the response records whether a backfill
    handler and a test handler are registered under ``(source_id, name)``,
    together with the registered test input schema (``None`` if absent).
    """
    source_def = REGISTRY.get(source_id)
    if not source_def:
        raise HTTPException(status_code=404, detail=f"Source {source_id} not found")

    # Missing or falsy attributes are normalised to empty lists.
    raw_jobs = list(getattr(source_def, "raw_enrichment_jobs", []) or [])
    canonical_jobs = list(getattr(source_def, "canonical_enrichment_jobs", []) or [])

    # Enrichment names with a backfill handler registered for this source,
    # whether or not the source lists them in raw_enrichment_jobs.
    implemented_backfills = sorted(
        registered_name
        for (registered_source, registered_name) in _RAW_SOURCE_BACKFILL_HANDLERS
        if registered_source == source_id
    )

    capabilities: List[Dict[str, Any]] = [
        {
            "name": job,
            "supports_backfill": (source_id, job) in _RAW_SOURCE_BACKFILL_HANDLERS,
            "supports_test": (source_id, job) in _RAW_SOURCE_TEST_HANDLERS,
            "test_input_schema": _RAW_SOURCE_TEST_SCHEMAS.get((source_id, job)),
        }
        for job in raw_jobs
    ]

    return {
        "status": "ok",
        "source_id": source_id,
        "ingestion_trigger": getattr(source_def, "ingestion_trigger", "automatic"),
        "enrichment_trigger": getattr(source_def, "enrichment_trigger", "automatic"),
        "raw_enrichments": raw_jobs,
        "raw_enrichment_capabilities": capabilities,
        "canonical_enrichments": canonical_jobs,
        "raw_backfill_supported": implemented_backfills,
    }
841
+
842
+
843
@router.post(
    "/sources/{source_id}/enrichments/{enrichment_name}/backfill",
    dependencies=[Depends(require_api_key)],
)
async def backfill_source_enrichment(
    source_id: str,
    enrichment_name: str,
    only_missing: bool = Body(True),
    limit: Optional[int] = Body(None),
) -> Dict[str, Any]:
    """Run a raw/source-layer enrichment backfill over a source's existing rows.

    This endpoint is scoped to an ingestion source and is separate from the
    canonical message enrichment endpoints.

    Checks run in this order and fail with:
        404 - ``source_id`` has no entry in ``REGISTRY``
        400 - ``enrichment_name`` is not among the source's raw enrichment jobs
        501 - no backfill handler is registered for the (source, enrichment) pair
        503 - no database connection is available
        500 - the handler (or merging its result) raised unexpectedly

    On success the handler's result mapping is merged over a header of
    ``status``, ``source_id``, ``enrichment_name``, ``only_missing`` and
    ``limit``; handler keys win on collision.
    """
    source_def = REGISTRY.get(source_id)
    if not source_def:
        raise HTTPException(status_code=404, detail=f"Source {source_id} not found")

    raw_job_names = set(getattr(source_def, "raw_enrichment_jobs", []) or [])
    if enrichment_name not in raw_job_names:
        not_configured = (
            f"Enrichment '{enrichment_name}' is not configured for source '{source_id}'. "
            f"Configured raw enrichments: {sorted(raw_job_names)}"
        )
        raise HTTPException(status_code=400, detail=not_configured)

    run_backfill = _RAW_SOURCE_BACKFILL_HANDLERS.get((source_id, enrichment_name))
    if not run_backfill:
        raise HTTPException(
            status_code=501,
            detail=f"Backfill for source='{source_id}' enrichment='{enrichment_name}' is not implemented",
        )

    db_conn = get_db_connection()
    if not db_conn:
        raise HTTPException(status_code=503, detail="Database connection not available")

    try:
        outcome = await run_backfill(
            db_conn=db_conn,
            only_missing=only_missing,
            limit=limit,
        )
        header = {
            "status": "ok",
            "source_id": source_id,
            "enrichment_name": enrichment_name,
            "only_missing": only_missing,
            "limit": limit,
        }
        # Merge stays inside the try so a non-mapping result surfaces as a 500.
        return {**header, **outcome}
    except HTTPException:
        # Let handlers choose their own HTTP status.
        raise
    except Exception as exc:  # noqa: BLE001
        logger.error(
            "Source enrichment backfill failed: source=%s enrichment=%s error=%s",
            source_id,
            enrichment_name,
            exc,
            exc_info=True,
        )
        raise HTTPException(status_code=500, detail=str(exc))
908
+
909
+
910
@router.post(
    "/sources/{source_id}/enrichments/{enrichment_name}/test",
    dependencies=[Depends(require_api_key)],
)
async def test_source_enrichment(
    source_id: str,
    enrichment_name: str,
    data_packet: Dict[str, Any] = Body(...),
) -> Dict[str, Any]:
    """Dry-run a source enrichment against a caller-supplied data packet.

    Checks run in this order and fail with:
        404 - ``source_id`` has no entry in ``REGISTRY``
        400 - ``enrichment_name`` is not among the source's raw enrichment jobs
        501 - no test handler is registered for the (source, enrichment) pair
        500 - the handler (or merging its result) raised unexpectedly

    On success the handler's result mapping is merged over a header of
    ``status``, ``source_id`` and ``enrichment_name``; handler keys win on
    collision.
    """
    source_def = REGISTRY.get(source_id)
    if not source_def:
        raise HTTPException(status_code=404, detail=f"Source {source_id} not found")

    raw_job_names = set(getattr(source_def, "raw_enrichment_jobs", []) or [])
    if enrichment_name not in raw_job_names:
        not_configured = (
            f"Enrichment '{enrichment_name}' is not configured for source '{source_id}'. "
            f"Configured raw enrichments: {sorted(raw_job_names)}"
        )
        raise HTTPException(status_code=400, detail=not_configured)

    run_test = _RAW_SOURCE_TEST_HANDLERS.get((source_id, enrichment_name))
    if not run_test:
        raise HTTPException(
            status_code=501,
            detail=f"Test for source='{source_id}' enrichment='{enrichment_name}' is not implemented",
        )

    try:
        outcome = await run_test(data_packet=data_packet)
        header = {
            "status": "ok",
            "source_id": source_id,
            "enrichment_name": enrichment_name,
        }
        # Merge stays inside the try so a non-mapping result surfaces as a 500.
        return {**header, **outcome}
    except HTTPException:
        # Let handlers choose their own HTTP status.
        raise
    except Exception as exc:  # noqa: BLE001
        logger.error(
            "Source enrichment test failed: source=%s enrichment=%s error=%s",
            source_id,
            enrichment_name,
            exc,
            exc_info=True,
        )
        raise HTTPException(status_code=500, detail=str(exc))