topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
topos/api/enrichment.py
ADDED
|
@@ -0,0 +1,959 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
from fastapi import APIRouter, Body, Depends, HTTPException
|
|
8
|
+
|
|
9
|
+
from ..auth import require_api_key
|
|
10
|
+
from ..enrichment.derived_tables import DerivedTablesManager
|
|
11
|
+
from ..enrichment.jobs import CANONICAL_JOBS
|
|
12
|
+
from ..enrichment.orchestrator import EnrichmentOrchestrator
|
|
13
|
+
from ..sources.registry import REGISTRY
|
|
14
|
+
from ..core.state import get_db_connection
|
|
15
|
+
# Removed imports: canonicalization.mappers, ingestion.parsers, storage.raw.file_store, analytics.raw_queries
|
|
16
|
+
# Enrichment now reads directly from canonical table (ai_chat_messages) per architecture design
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger("topos.api.enrichment")
|
|
19
|
+
|
|
20
|
+
router = APIRouter()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _url_classification_test_schema() -> Dict[str, Any]:
|
|
24
|
+
return {
|
|
25
|
+
"type": "object",
|
|
26
|
+
"required": ["url"],
|
|
27
|
+
"properties": {
|
|
28
|
+
"url": {
|
|
29
|
+
"type": "string",
|
|
30
|
+
"title": "URL",
|
|
31
|
+
"description": "Website URL to classify",
|
|
32
|
+
"example": "https://www.nytimes.com",
|
|
33
|
+
},
|
|
34
|
+
"title": {
|
|
35
|
+
"type": "string",
|
|
36
|
+
"title": "Page Title",
|
|
37
|
+
"description": "Optional page title for better classification context",
|
|
38
|
+
"example": "The New York Times - Breaking News",
|
|
39
|
+
},
|
|
40
|
+
},
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
async def _test_browser_visits_url_classification(*, data_packet: Dict[str, Any]) -> Dict[str, Any]:
    """Run a single URL classification through the engine for a test payload.

    Args:
        data_packet: Mapping with a required ``url`` string and an optional
            ``title`` string.

    Returns:
        Dict with ``status`` set to ``"ok"``, the echoed ``input`` (url as
        received, not stripped) and the engine ``output``.

    Raises:
        HTTPException: 400 when ``url`` is missing/blank or ``title`` is not a
            string; 502 when the engine result status is not ``"completed"``.
    """
    from ..engine import Engine, build_url_classification_task

    raw_url = data_packet.get("url")
    page_title = data_packet.get("title")

    url_is_usable = isinstance(raw_url, str) and bool(raw_url.strip())
    if not url_is_usable:
        raise HTTPException(status_code=400, detail="data_packet.url must be a non-empty string")
    if not (page_title is None or isinstance(page_title, str)):
        raise HTTPException(status_code=400, detail="data_packet.title must be a string when provided")

    classification_task = build_url_classification_task(
        task_id="test_url_cls",
        url=raw_url.strip(),
        title=page_title,
    )

    # Engine.run is a plain callable, so it is handed to a worker thread.
    outcome = await asyncio.to_thread(Engine().run, classification_task)

    if outcome.status == "completed":
        echoed_input = {"url": raw_url, "title": page_title}
        return {
            "status": "ok",
            "input": echoed_input,
            "output": outcome.output,
        }

    raise HTTPException(
        status_code=502,
        detail=outcome.error or f"Engine returned status {outcome.status}",
    )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# Registry of async test handlers, keyed by a (source, enrichment) string pair.
# Each handler is a coroutine function taking a keyword-only ``data_packet``.
# NOTE(review): the code that looks entries up is outside this view.
_RAW_SOURCE_TEST_HANDLERS = {
    ("browser_visits", "url_classification"): _test_browser_visits_url_classification,
}
|
|
76
|
+
|
|
77
|
+
# Input schemas for the test handlers, keyed by the same (source, enrichment)
# string pair. The schema function is called once, at import time, so every
# reader of this mapping shares the same dict object.
_RAW_SOURCE_TEST_SCHEMAS = {
    ("browser_visits", "url_classification"): _url_classification_test_schema(),
}
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
async def _backfill_browser_visits_url_classification(
    *,
    db_conn,
    only_missing: bool = True,
    limit: Optional[int] = None,
) -> Dict[str, Any]:
    """Classify URLs of raw browser-visit rows and store the results.

    Args:
        db_conn: Database connection exposing ``execute``; queried through
            ``sqlite_master``, so a SQLite connection is expected.
        only_missing: When True, only rows without a matching entry in
            ``browser_url_classification`` are selected.
        limit: Optional positive row cap; ignored unless it is an int > 0.

    Returns:
        Summary dict with ``rows_scanned``, ``rows_processed``,
        ``rows_skipped``, ``rows_failed`` and ``errors`` (first 100 only).
    """
    from ..engine import Engine, build_url_classification_task
    from ..storage.raw.browser_flat_tables import (
        ensure_browser_url_classification_table,
        write_browser_url_classification,
    )
    from ..enrichment.progress_bar import ProgressBar
    from ..storage.raw.raw_tables_manager import RawTablesManager

    source_table = "raw_chat_messages_browservisits"

    logger.info(
        "[PIPELINE:ENRICHMENT] Source backfill start: source=browser_visits enrichment=url_classification only_missing=%s limit=%s",
        only_missing,
        limit,
    )

    # Bring the raw browser visits table to the normalized-column schema first.
    RawTablesManager(db_conn).ensure_raw_table(source_table)

    # Nothing to do when the source table is still absent: report an empty run.
    table_row = db_conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (source_table,),
    ).fetchone()
    if not table_row:
        logger.info(
            "[PIPELINE:ENRICHMENT] Source backfill complete: source table missing (%s)",
            source_table,
        )
        return {
            "rows_scanned": 0,
            "rows_processed": 0,
            "rows_skipped": 0,
            "rows_failed": 0,
            "errors": [],
        }

    ensure_browser_url_classification_table(db_conn)

    if only_missing:
        # Anti-join: keep visits with no classification row for the derived id.
        query = """
            SELECT
                (COALESCE(v.url, '') || '_' || COALESCE(v.visited_at, '')) AS derived_record_id,
                v.dataset_id,
                v.url,
                v.title
            FROM raw_chat_messages_browservisits v
            LEFT JOIN browser_url_classification c
                ON c.source_table = 'browser_visits'
                AND c.record_id = (COALESCE(v.url, '') || '_' || COALESCE(v.visited_at, ''))
            WHERE c.record_id IS NULL
            ORDER BY v.visited_at ASC
        """
    else:
        query = """
            SELECT
                (COALESCE(v.url, '') || '_' || COALESCE(v.visited_at, '')) AS derived_record_id,
                v.dataset_id,
                v.url,
                v.title
            FROM raw_chat_messages_browservisits v
            ORDER BY v.visited_at ASC
        """

    bind_values: List[Any] = []
    if isinstance(limit, int) and limit > 0:
        query += " LIMIT ?"
        bind_values.append(limit)

    rows = db_conn.execute(query, tuple(bind_values)).fetchall()

    n_done = 0
    n_skipped = 0
    n_failed = 0
    failures: List[Dict[str, Any]] = []

    if rows:
        with ProgressBar(total=len(rows), desc="url_classification backfill") as pbar:
            for row in rows:
                record_id, dataset_id, url, title = row[0], row[1], row[2], row[3]

                has_url = isinstance(url, str) and bool(url.strip())
                if not has_url:
                    n_skipped += 1
                    pbar.update(1)
                    continue

                try:
                    task = build_url_classification_task(
                        task_id=f"backfill_url_{record_id}",
                        url=url,
                        title=title,
                        source_id="browser_visits",
                        record_ids=[record_id],
                    )
                    result = await asyncio.to_thread(Engine().run, task)
                    if result.status == "completed":
                        classification = result.output
                        write_browser_url_classification(
                            db_conn,
                            source_table="browser_visits",
                            record_id=record_id,
                            dataset_id=dataset_id,
                            url=url,
                            title=title,
                            category=classification.get("category"),
                            confidence=classification.get("confidence"),
                            model_name=classification.get("model"),
                            ensure_table=False,
                            log_write=False,  # Avoid per-row log spam during bulk backfill
                        )
                        n_done += 1
                    else:
                        n_failed += 1
                        failures.append({"record_id": record_id, "error": result.error or result.status})
                except Exception as exc:  # noqa: BLE001
                    # One bad row must not abort the whole backfill.
                    n_failed += 1
                    failures.append({"record_id": record_id, "error": str(exc)})
                finally:
                    pbar.update(1)

    summary = {
        "rows_scanned": len(rows),
        "rows_processed": n_done,
        "rows_skipped": n_skipped,
        "rows_failed": n_failed,
        "errors": failures[:100],
    }
    logger.info(
        "[PIPELINE:ENRICHMENT] Source backfill complete: source=browser_visits enrichment=url_classification scanned=%d processed=%d skipped=%d failed=%d",
        summary["rows_scanned"],
        summary["rows_processed"],
        summary["rows_skipped"],
        summary["rows_failed"],
    )
    return summary
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# Registry of async backfill handlers, keyed by a (source, enrichment) string
# pair. Each handler is a coroutine function taking keyword-only ``db_conn``,
# ``only_missing`` and ``limit`` and returning a summary dict.
# NOTE(review): the code that dispatches through this mapping is outside this view.
_RAW_SOURCE_BACKFILL_HANDLERS = {
    ("browser_visits", "url_classification"): _backfill_browser_visits_url_classification,
}
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _get_enriched_message_ids(table_name: str, conn) -> set[str]:
|
|
234
|
+
"""Get set of message_ids that have enrichment records in the given table."""
|
|
235
|
+
if not conn:
|
|
236
|
+
return set()
|
|
237
|
+
try:
|
|
238
|
+
cursor = conn.execute(f"SELECT DISTINCT message_id FROM {table_name}")
|
|
239
|
+
return {row[0] for row in cursor.fetchall()}
|
|
240
|
+
except Exception as e:
|
|
241
|
+
logger.warning("Failed to query enriched message IDs from %s: %s", table_name, e)
|
|
242
|
+
return set()
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _sqlite_table_exists(db_conn, table_name: str) -> bool:
    """Return True when ``sqlite_master`` lists a table named *table_name*."""
    row = db_conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (table_name,),
    ).fetchone()
    return row is not None


def _log_message_table_diagnostics(db_conn) -> None:
    """Log (at DEBUG) which source_ids exist in ai_chat_messages and the total row count."""
    per_source = db_conn.execute(
        "SELECT source_id, COUNT(*) AS count FROM ai_chat_messages GROUP BY source_id"
    ).fetchall()
    logger.debug(
        "Debug: All source_ids in ai_chat_messages table: %s",
        [(row[0], row[1]) for row in per_source] if per_source else "none",
    )
    total_all = db_conn.execute("SELECT COUNT(*) FROM ai_chat_messages").fetchone()[0]
    logger.debug("Debug: Total messages in ai_chat_messages (all sources): %d", total_all)


def _log_conversation_diagnostics(db_conn, source_id: str) -> None:
    """Log (at DEBUG) owner ids and sample conversation ids relevant to *source_id*."""
    owners = db_conn.execute(
        """
        SELECT c.owner_user_id, COUNT(*) AS msg_count
        FROM ai_chat_messages m
        INNER JOIN ai_chat_conversations c ON m.conversation_id = c.conversation_id
        WHERE m.source_id = ?
        GROUP BY c.owner_user_id
        """,
        (source_id,),
    ).fetchall()
    logger.debug(
        "Debug: Found conversations with owner_user_ids: %s",
        [(row[0], row[1]) for row in owners] if owners else "none",
    )

    # Only five samples are ever logged, so let the database cap the result.
    conv_ids = [
        row[0]
        for row in db_conn.execute(
            "SELECT DISTINCT conversation_id FROM ai_chat_messages WHERE source_id = ? LIMIT 5",
            (source_id,),
        ).fetchall()
    ]
    logger.debug("Debug: Conversation IDs in messages: %s", conv_ids if conv_ids else "none")

    all_convs = [
        (row[0], row[1])
        for row in db_conn.execute(
            "SELECT conversation_id, owner_user_id FROM ai_chat_conversations LIMIT 5"
        ).fetchall()
    ]
    logger.debug("Debug: All conversations in table: %s", all_convs if all_convs else "none")


async def _find_unprocessed_messages(
    source_id: str,
    dataset_id: Optional[str] = None,
    job_names: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """Find canonical messages that haven't been enriched yet.

    Reads directly from the ``ai_chat_messages`` table (the canonical table),
    then drops every message whose id already appears in the derived table of
    any of the checked jobs.

    Args:
        source_id: Source identifier; must be present in ``REGISTRY``.
        dataset_id: Optional dataset ID. The part before the first ``:`` is
            used as the owner user id when filtering through
            ``ai_chat_conversations``.
        job_names: Enrichment job names to check; defaults to the source's
            ``canonical_enrichment_jobs``.

    Returns:
        List of canonical message dicts that still need enrichment. Empty when
        the source has no canonical jobs, no database connection is available,
        the canonical table is missing, or reading it fails.

    Raises:
        ValueError: If ``source_id`` is not registered.
    """
    source_def = REGISTRY.get(source_id)
    if not source_def:
        raise ValueError(f"Source {source_id} not found")

    if not source_def.canonical_enrichment_jobs:
        return []

    # Default to every canonical enrichment job configured for the source.
    jobs_to_check = job_names or source_def.canonical_enrichment_jobs

    db_conn = get_db_connection()
    if not db_conn:
        logger.warning("No database connection available for enrichment")
        return []

    columns = (
        "message_id",
        "conversation_id",
        "sender_type",
        "sender_id",
        "event_at",
        "content",
        "content_rendered",
        "metadata_json",
        "sequence",
        "source_id",
    )
    source_only_query = """
        SELECT message_id, conversation_id, sender_type, sender_id,
               event_at, content, content_rendered, metadata_json, sequence, source_id
        FROM ai_chat_messages
        WHERE source_id = ?
        ORDER BY event_at ASC
    """
    owner_filtered_query = """
        SELECT m.message_id, m.conversation_id, m.sender_type, m.sender_id,
               m.event_at, m.content, m.content_rendered, m.metadata_json, m.sequence, m.source_id
        FROM ai_chat_messages m
        INNER JOIN ai_chat_conversations c ON m.conversation_id = c.conversation_id
        WHERE m.source_id = ? AND c.owner_user_id = ?
        ORDER BY m.event_at ASC
    """

    # The diagnostic queries scan whole tables; only pay for them when their
    # output would actually be emitted.
    debug_enabled = logger.isEnabledFor(logging.DEBUG)

    try:
        if not _sqlite_table_exists(db_conn, "ai_chat_messages"):
            logger.info(
                "ai_chat_messages table does not exist yet. "
                "Wait for ingestion to complete (job status 'completed') before triggering enrichment."
            )
            return []

        # The conversations table is only needed for dataset_id filtering.
        has_conversations_table = _sqlite_table_exists(db_conn, "ai_chat_conversations")

        total_msgs = db_conn.execute(
            "SELECT COUNT(*) FROM ai_chat_messages WHERE source_id = ?",
            (source_id,),
        ).fetchone()[0]
        logger.debug("Debug: Total messages in ai_chat_messages for source_id=%s: %d", source_id, total_msgs)

        if debug_enabled:
            _log_message_table_diagnostics(db_conn)

        if has_conversations_table and dataset_id and total_msgs > 0:
            user_id = dataset_id.partition(":")[0]
            logger.debug(
                "Querying canonical messages: source_id=%s, dataset_id=%s, extracted_user_id=%s",
                source_id,
                dataset_id,
                user_id,
            )
            if debug_enabled:
                _log_conversation_diagnostics(db_conn, source_id)

            # Fetch once and keep the rows instead of counting and re-running.
            rows = db_conn.execute(owner_filtered_query, (source_id, user_id)).fetchall()
            logger.debug("Debug: Query with user_id filter returned %d messages", len(rows))

            if not rows:
                # Fall back to source_id only (for local mode).
                # NOTE(review): this returns messages of every owner for the
                # source; confirm that is acceptable outside local mode.
                logger.debug("Debug: No messages found with user_id filter, falling back to source_id only")
                rows = db_conn.execute(source_only_query, (source_id,)).fetchall()
        else:
            # No user filtering possible: conversations table missing, no
            # dataset_id, or no messages for this source.
            logger.debug(
                "Querying canonical messages without user filter: source_id=%s, has_conversations_table=%s, dataset_id=%s",
                source_id,
                has_conversations_table,
                dataset_id,
            )
            rows = db_conn.execute(source_only_query, (source_id,)).fetchall()

        canonical_messages: List[Dict[str, Any]] = [
            {name: row[index] for index, name in enumerate(columns)}
            for row in rows
        ]

        logger.debug(
            "Found %d canonical messages for source_id=%s, dataset_id=%s",
            len(canonical_messages),
            source_id,
            dataset_id,
        )

    except Exception as e:
        # Best-effort read: report (with traceback) and treat as "nothing to do".
        logger.exception("Failed to read canonical messages from ai_chat_messages table: %s", e)
        return []

    if not canonical_messages:
        logger.debug("No canonical messages found for source_id=%s, dataset_id=%s", source_id, dataset_id)
        return []

    # Map job names to their derived tables via the job registry.
    job_to_table = {job.get_job_name(): job.get_derived_table() for job in CANONICAL_JOBS}

    # NOTE(review): ids are unioned across jobs, so a message enriched by any
    # one checked job is treated as processed for all of them - confirm intent.
    enriched_ids: set[str] = set()
    for job_name in jobs_to_check:
        table_name = job_to_table.get(job_name)
        if table_name:
            enriched_ids.update(_get_enriched_message_ids(table_name, db_conn))
        else:
            logger.warning("Unknown enrichment job: %s (skipping check)", job_name)

    unprocessed = [
        msg for msg in canonical_messages
        if msg.get("message_id") not in enriched_ids
    ]

    logger.debug(
        "Found %d unprocessed messages out of %d total canonical messages for source_id=%s",
        len(unprocessed),
        len(canonical_messages),
        source_id,
    )

    return unprocessed
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
def _read_all_canonical_messages_for_reprocess(
    db_conn: Any,
    source_id: str,
    dataset_id: Optional[str],
) -> Optional[List[Dict[str, Any]]]:
    """Read every canonical message for a source, regardless of enrichment state.

    Args:
        db_conn: Connection object exposing ``execute(sql, params)``.
        source_id: Value matched against ``ai_chat_messages.source_id``.
        dataset_id: Optional dataset ID. When given (and the
            ``ai_chat_conversations`` table exists), the part before the
            first ``:`` is matched against ``owner_user_id``.

    Returns:
        ``None`` when the ``ai_chat_messages`` table does not exist, otherwise
        a list of message dicts ordered by ``event_at`` ascending (possibly
        empty).
    """
    # Single source of truth for the SELECT column order and the dict keys.
    columns = (
        "message_id",
        "conversation_id",
        "sender_type",
        "sender_id",
        "event_at",
        "content",
        "content_rendered",
        "metadata_json",
        "sequence",
        "source_id",
    )

    # Check if ai_chat_messages table exists
    cursor = db_conn.execute("""
        SELECT name FROM sqlite_master
        WHERE type='table' AND name='ai_chat_messages'
    """)
    if not cursor.fetchone():
        return None

    # Check if ai_chat_conversations table exists for dataset_id filtering
    cursor = db_conn.execute("""
        SELECT name FROM sqlite_master
        WHERE type='table' AND name='ai_chat_conversations'
    """)
    has_conversations_table = cursor.fetchone() is not None

    if has_conversations_table and dataset_id:
        user_id = dataset_id.split(":")[0] if ":" in dataset_id else dataset_id
        # Use INNER JOIN to ensure we only get messages with matching conversations
        query = """
            SELECT m.message_id, m.conversation_id, m.sender_type, m.sender_id,
                   m.event_at, m.content, m.content_rendered, m.metadata_json, m.sequence, m.source_id
            FROM ai_chat_messages m
            INNER JOIN ai_chat_conversations c ON m.conversation_id = c.conversation_id
            WHERE m.source_id = ? AND c.owner_user_id = ?
            ORDER BY m.event_at ASC
        """
        cursor = db_conn.execute(query, (source_id, user_id))
    else:
        query = """
            SELECT message_id, conversation_id, sender_type, sender_id,
                   event_at, content, content_rendered, metadata_json, sequence, source_id
            FROM ai_chat_messages
            WHERE source_id = ?
            ORDER BY event_at ASC
        """
        cursor = db_conn.execute(query, (source_id,))

    # Convert positional rows to dictionaries keyed by column name.
    return [
        {column: row[index] for index, column in enumerate(columns)}
        for row in cursor.fetchall()
    ]


async def _process_enrichment_core(
    source_id: str,
    dataset_id: Optional[str] = None,
    job_names: Optional[List[str]] = None,
    force_reprocess: bool = False,
) -> Dict[str, Any]:
    """Core logic for processing enrichment (reusable from HTTP and WebSocket).

    Args:
        source_id: Source identifier
        dataset_id: Optional dataset ID to filter by
        job_names: Optional list of specific enrichment jobs to run; falls
            back to the source's ``canonical_enrichment_jobs`` when empty
        force_reprocess: If True, reprocess even if already enriched

    Returns:
        Processing results: a dict with ``status``, ``messages_processed`` and
        ``records_created``, plus either a ``message`` (early exits) or
        ``source_id``/``jobs_run``/``errors`` (after a run).

    Raises:
        ValueError: If ``source_id`` is not present in the registry.
    """
    # Get source definition
    source_def = REGISTRY.get(source_id)
    if not source_def:
        raise ValueError(f"Source {source_id} not found")

    # Determine which jobs to run
    jobs_to_run = job_names or source_def.canonical_enrichment_jobs
    if not jobs_to_run:
        return {
            "status": "ok",
            "message": "No enrichment jobs configured for this source",
            "messages_processed": 0,
            "records_created": {},
        }

    # Get database connection
    db_conn = get_db_connection()
    if not db_conn:
        return {
            "status": "error",
            "message": "Database connection not available",
            "messages_processed": 0,
            "records_created": {},
        }

    # Find the messages to enrich
    if force_reprocess:
        # For force reprocess, load all canonical messages regardless of
        # enrichment status, reading directly from the canonical table.
        try:
            unprocessed_messages = _read_all_canonical_messages_for_reprocess(
                db_conn, source_id, dataset_id
            )
        except Exception as e:
            logger.error("Failed to read canonical messages for force_reprocess: %s", e)
            return {
                "status": "error",
                "message": f"Failed to read canonical messages: {e}",
                "messages_processed": 0,
                "records_created": {},
            }
        if unprocessed_messages is None:
            # Canonical table has not been created yet.
            return {
                "status": "ok",
                "message": "No canonical messages found",
                "messages_processed": 0,
                "records_created": {},
            }
    else:
        unprocessed_messages = await _find_unprocessed_messages(source_id, dataset_id, jobs_to_run)

    if not unprocessed_messages:
        return {
            "status": "ok",
            "message": "No unprocessed messages found",
            "messages_processed": 0,
            "records_created": {},
        }

    # Run enrichment
    tables_manager = DerivedTablesManager(conn=db_conn)
    orchestrator = EnrichmentOrchestrator(tables_manager=tables_manager)

    logger.info(
        "[PIPELINE:ENRICHMENT] %s: Manual enrichment triggered: source_id=%s, messages=%d, jobs=%s",
        orchestrator,
        source_id,
        len(unprocessed_messages),
        jobs_to_run,
    )

    # Define progress callback to update progress during execution.
    # Progress reporting is optional: any failure here leaves the callback as
    # None, but is logged so a broken progress module is not silently hidden.
    progress_callback = None
    try:
        from ..enrichment.progress import get_progress

        # Try to get progress object if it exists (created by handler)
        progress_obj = get_progress(source_id)  # Use source_id as fallback lookup
        if progress_obj:

            def progress_callback(
                processed_count: int,
                total_count: int,
                job_name: str,
                job_percent: float,
                current_job_progress: float,
            ):
                """Update progress as jobs execute."""
                # Both figures are estimates derived from the overall percentage.
                estimated_messages_processed = int((job_percent / 100) * total_count)
                jobs_complete = int((job_percent / 100) * len(jobs_to_run))
                progress_obj.update(
                    messages_processed=estimated_messages_processed,
                    messages_skipped=0,
                    current_job_name=job_name,
                    current_job_progress_percent=current_job_progress,
                    jobs_complete=jobs_complete,
                    jobs_total=len(jobs_to_run),
                )

    except Exception:
        logger.debug(
            "Enrichment progress reporting unavailable for source_id=%s",
            source_id,
            exc_info=True,
        )

    enrichment_result = await orchestrator.run_canonical(
        unprocessed_messages,
        job_names=jobs_to_run,
        progress_callback=progress_callback,
    )

    return {
        "status": "ok",
        "source_id": source_id,
        "messages_processed": len(unprocessed_messages),
        "jobs_run": enrichment_result.get("jobs_run", 0),
        "records_created": enrichment_result.get("records_created", {}),
        "errors": enrichment_result.get("errors", []),
    }
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
async def _get_enrichment_status_core(
    source_id: str,
    dataset_id: Optional[str] = None,
) -> Dict[str, Any]:
    """Core logic for getting enrichment status (reusable from HTTP and WebSocket).

    This function reads directly from the ai_chat_messages table (canonical table)
    as the source of truth, per the architecture design.

    Args:
        source_id: Source identifier looked up in the registry and matched
            against ``ai_chat_messages.source_id``.
        dataset_id: Optional dataset ID; when given (and the conversations
            table exists) the part before the first ``:`` is matched against
            ``ai_chat_conversations.owner_user_id``.

    Returns:
        Status information including counts of processed/unprocessed messages

    Raises:
        ValueError: If ``source_id`` is not present in the registry.
    """
    source_def = REGISTRY.get(source_id)
    if not source_def:
        raise ValueError(f"Source {source_id} not found")

    def _status_payload(
        status: str,
        total: int = 0,
        processed: int = 0,
        unprocessed: int = 0,
        message: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Build the response dict shared by every exit path of this function."""
        payload: Dict[str, Any] = {
            "status": status,
            "source_id": source_id,
            "total_messages": total,
            "processed_messages": processed,
            "unprocessed_messages": unprocessed,
            "enrichment_jobs": source_def.canonical_enrichment_jobs,
            "enrichment_trigger": getattr(source_def, "enrichment_trigger", "automatic"),
        }
        if message is not None:
            payload["message"] = message
        return payload

    # Get database connection
    db_conn = get_db_connection()
    if not db_conn:
        return _status_payload("error", message="Database connection not available")

    # Read canonical messages directly from ai_chat_messages table
    try:
        # Check if ai_chat_messages table exists
        cursor = db_conn.execute("""
            SELECT name FROM sqlite_master
            WHERE type='table' AND name='ai_chat_messages'
        """)
        if not cursor.fetchone():
            return _status_payload("ok", message="Canonical table does not exist yet")

        # Check if ai_chat_conversations table exists for dataset_id filtering
        cursor = db_conn.execute("""
            SELECT name FROM sqlite_master
            WHERE type='table' AND name='ai_chat_conversations'
        """)
        has_conversations_table = cursor.fetchone() is not None

        # Build query to count messages from canonical table
        if has_conversations_table and dataset_id:
            # Join with conversations table to filter by owner_user_id.
            # The WHERE clause on c.owner_user_id already discards unmatched
            # rows, so an INNER JOIN states the intent directly and matches the
            # force-reprocess query used by the processing path.
            user_id = dataset_id.split(":")[0] if ":" in dataset_id else dataset_id
            query = """
                SELECT COUNT(*)
                FROM ai_chat_messages m
                INNER JOIN ai_chat_conversations c ON m.conversation_id = c.conversation_id
                WHERE m.source_id = ? AND c.owner_user_id = ?
            """
            cursor = db_conn.execute(query, (source_id, user_id))
        else:
            # Direct query without user filtering
            query = "SELECT COUNT(*) FROM ai_chat_messages WHERE source_id = ?"
            cursor = db_conn.execute(query, (source_id,))

        total = cursor.fetchone()[0]

    except Exception as e:
        logger.error("Failed to read canonical messages from ai_chat_messages table: %s", e)
        return _status_payload("error", message=f"Error reading canonical table: {e}")

    # Get unprocessed messages count (reuse the logic from _find_unprocessed_messages)
    unprocessed = await _find_unprocessed_messages(source_id, dataset_id)
    unprocessed_count = len(unprocessed)
    processed_count = total - unprocessed_count

    return _status_payload(
        "ok",
        total=total,
        processed=processed_count,
        unprocessed=unprocessed_count,
    )
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
@router.post("/enrichment/process", dependencies=[Depends(require_api_key)])
async def process_enrichment(
    source_id: str = Body(...),
    dataset_id: Optional[str] = Body(None),
    job_names: Optional[List[str]] = Body(None),
    force_reprocess: bool = Body(False),
) -> Dict[str, Any]:
    """Manually trigger enrichment for unprocessed messages.

    Thin HTTP wrapper around ``_process_enrichment_core``.

    Args:
        source_id: Source identifier
        dataset_id: Optional dataset ID to filter by
        job_names: Optional list of specific enrichment jobs to run
        force_reprocess: If True, reprocess even if already enriched

    Returns:
        Processing results

    Raises:
        HTTPException: 404 when the core raises ``ValueError`` (e.g. unknown
            source), 500 for any other failure.
    """
    try:
        return await _process_enrichment_core(
            source_id=source_id,
            dataset_id=dataset_id,
            job_names=job_names,
            force_reprocess=force_reprocess,
        )
    except ValueError as e:
        # Chain explicitly so the original error is kept as __cause__.
        raise HTTPException(status_code=404, detail=str(e)) from e
    except Exception as e:
        logger.error("Manual enrichment failed: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
@router.get("/enrichment/status", dependencies=[Depends(require_api_key)])
async def get_processing_status(
    source_id: str,
    dataset_id: Optional[str] = None,
) -> Dict[str, Any]:
    """Get enrichment status for a source.

    Thin HTTP wrapper around ``_get_enrichment_status_core``.

    Args:
        source_id: Source identifier
        dataset_id: Optional dataset ID to filter by

    Returns:
        Status information including counts of processed/unprocessed messages

    Raises:
        HTTPException: 404 when the core raises ``ValueError`` (e.g. unknown
            source), 500 for any other failure.
    """
    try:
        return await _get_enrichment_status_core(
            source_id=source_id,
            dataset_id=dataset_id,
        )
    except ValueError as e:
        # Chain explicitly so the original error is kept as __cause__.
        raise HTTPException(status_code=404, detail=str(e)) from e
    except Exception as e:
        logger.error("Failed to get enrichment status: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
@router.get(
    "/sources/{source_id}/enrichments",
    dependencies=[Depends(require_api_key)],
)
async def list_source_enrichments(source_id: str) -> Dict[str, Any]:
    """Describe which enrichments a source is configured for and supports.

    Reports the source's raw and canonical enrichment job names, per-raw-job
    capability flags (backfill handler, test handler, test input schema), and
    the sorted names of enrichments that have a backfill handler registered
    for this source.

    Raises:
        HTTPException: 404 if the source is not in the registry.
    """
    source_def = REGISTRY.get(source_id)
    if not source_def:
        raise HTTPException(status_code=404, detail=f"Source {source_id} not found")

    def _configured(attribute: str) -> List[Any]:
        # Missing or falsy job attributes are treated as "no jobs".
        return list(getattr(source_def, attribute, []) or [])

    def _capability(job: str) -> Dict[str, Any]:
        lookup = (source_id, job)
        return {
            "name": job,
            "supports_backfill": lookup in _RAW_SOURCE_BACKFILL_HANDLERS,
            "supports_test": lookup in _RAW_SOURCE_TEST_HANDLERS,
            "test_input_schema": _RAW_SOURCE_TEST_SCHEMAS.get(lookup),
        }

    raw_jobs = _configured("raw_enrichment_jobs")
    canonical_jobs = _configured("canonical_enrichment_jobs")

    # Handler registry keys are (source_id, enrichment_name) pairs.
    implemented_backfills = sorted(
        handler_name
        for (handler_source, handler_name) in _RAW_SOURCE_BACKFILL_HANDLERS.keys()
        if handler_source == source_id
    )
    capabilities: List[Dict[str, Any]] = [_capability(job) for job in raw_jobs]

    return {
        "status": "ok",
        "source_id": source_id,
        "ingestion_trigger": getattr(source_def, "ingestion_trigger", "automatic"),
        "enrichment_trigger": getattr(source_def, "enrichment_trigger", "automatic"),
        "raw_enrichments": raw_jobs,
        "raw_enrichment_capabilities": capabilities,
        "canonical_enrichments": canonical_jobs,
        "raw_backfill_supported": implemented_backfills,
    }
|
|
841
|
+
|
|
842
|
+
|
|
843
|
+
@router.post(
    "/sources/{source_id}/enrichments/{enrichment_name}/backfill",
    dependencies=[Depends(require_api_key)],
)
async def backfill_source_enrichment(
    source_id: str,
    enrichment_name: str,
    only_missing: bool = Body(True),
    limit: Optional[int] = Body(None),
) -> Dict[str, Any]:
    """Backfill an enrichment for an ingestion source's existing rows.

    This endpoint is source-scoped (raw/source layer), separate from canonical
    message enrichment endpoints.

    Args:
        source_id: Source identifier (path parameter).
        enrichment_name: Name of the raw enrichment to backfill (path parameter).
        only_missing: Forwarded unchanged to the backfill handler.
        limit: Forwarded unchanged to the backfill handler.

    Returns:
        A dict echoing the request parameters with ``status`` "ok", merged
        with the handler's result (handler keys win on collision).

    Raises:
        HTTPException: 404 unknown source; 400 enrichment not configured for
            the source; 501 no backfill handler registered; 503 no database
            connection; 500 if the handler fails.
    """
    source_def = REGISTRY.get(source_id)
    if not source_def:
        raise HTTPException(status_code=404, detail=f"Source {source_id} not found")

    configured_raw_jobs = set(getattr(source_def, "raw_enrichment_jobs", []) or [])
    if enrichment_name not in configured_raw_jobs:
        raise HTTPException(
            status_code=400,
            detail=(
                f"Enrichment '{enrichment_name}' is not configured for source '{source_id}'. "
                f"Configured raw enrichments: {sorted(configured_raw_jobs)}"
            ),
        )

    handler = _RAW_SOURCE_BACKFILL_HANDLERS.get((source_id, enrichment_name))
    if not handler:
        raise HTTPException(
            status_code=501,
            detail=f"Backfill for source='{source_id}' enrichment='{enrichment_name}' is not implemented",
        )

    db_conn = get_db_connection()
    if not db_conn:
        raise HTTPException(status_code=503, detail="Database connection not available")

    try:
        result = await handler(
            db_conn=db_conn,
            only_missing=only_missing,
            limit=limit,
        )
        return {
            "status": "ok",
            "source_id": source_id,
            "enrichment_name": enrichment_name,
            "only_missing": only_missing,
            "limit": limit,
            **result,
        }
    except HTTPException:
        # Handlers may raise their own HTTP errors; pass them through untouched.
        raise
    except Exception as exc:  # noqa: BLE001
        logger.error(
            "Source enrichment backfill failed: source=%s enrichment=%s error=%s",
            source_id,
            enrichment_name,
            exc,
            exc_info=True,
        )
        # Chain explicitly so the original error is kept as __cause__.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
908
|
+
|
|
909
|
+
|
|
910
|
+
@router.post(
    "/sources/{source_id}/enrichments/{enrichment_name}/test",
    dependencies=[Depends(require_api_key)],
)
async def test_source_enrichment(
    source_id: str,
    enrichment_name: str,
    data_packet: Dict[str, Any] = Body(...),
) -> Dict[str, Any]:
    """Test-run a source enrichment against a provided data packet.

    Args:
        source_id: Source identifier (path parameter).
        enrichment_name: Name of the raw enrichment to test (path parameter).
        data_packet: Request body forwarded unchanged to the test handler.

    Returns:
        A dict with ``status`` "ok", ``source_id`` and ``enrichment_name``,
        merged with the handler's result (handler keys win on collision).

    Raises:
        HTTPException: 404 unknown source; 400 enrichment not configured for
            the source; 501 no test handler registered; 500 if the handler
            fails.
    """
    source_def = REGISTRY.get(source_id)
    if not source_def:
        raise HTTPException(status_code=404, detail=f"Source {source_id} not found")

    configured_raw_jobs = set(getattr(source_def, "raw_enrichment_jobs", []) or [])
    if enrichment_name not in configured_raw_jobs:
        raise HTTPException(
            status_code=400,
            detail=(
                f"Enrichment '{enrichment_name}' is not configured for source '{source_id}'. "
                f"Configured raw enrichments: {sorted(configured_raw_jobs)}"
            ),
        )

    handler = _RAW_SOURCE_TEST_HANDLERS.get((source_id, enrichment_name))
    if not handler:
        raise HTTPException(
            status_code=501,
            detail=f"Test for source='{source_id}' enrichment='{enrichment_name}' is not implemented",
        )

    try:
        result = await handler(data_packet=data_packet)
        return {
            "status": "ok",
            "source_id": source_id,
            "enrichment_name": enrichment_name,
            **result,
        }
    except HTTPException:
        # Handlers may raise their own HTTP errors; pass them through untouched.
        raise
    except Exception as exc:  # noqa: BLE001
        logger.error(
            "Source enrichment test failed: source=%s enrichment=%s error=%s",
            source_id,
            enrichment_name,
            exc,
            exc_info=True,
        )
        # Chain explicitly so the original error is kept as __cause__.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|