topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1100 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, AsyncIterator, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from .checkpoints.checkpoint_store import CheckpointStore, IngestionCheckpoint
|
|
13
|
+
from .parser import parse_file
|
|
14
|
+
from .parsers import PARSER_REGISTRY
|
|
15
|
+
from .progress import IngestionProgress
|
|
16
|
+
from .sources.base import RawRecord
|
|
17
|
+
from .state_machine import IngestionJob
|
|
18
|
+
from ..canonicalization.mappers import MAPPER_REGISTRY
|
|
19
|
+
from ..config.settings import settings
|
|
20
|
+
from ..enrichment.derived_tables import DerivedTablesManager
|
|
21
|
+
from ..enrichment.jobs import CANONICAL_JOBS
|
|
22
|
+
from ..enrichment.orchestrator import EnrichmentOrchestrator
|
|
23
|
+
from ..enrichment.progress_bar import ProgressBar
|
|
24
|
+
from ..engine.usage_observation import emit_usage_observation
|
|
25
|
+
from ..sources.registry import REGISTRY
|
|
26
|
+
from ..storage.db.postgres import connect_postgres
|
|
27
|
+
from ..storage.raw.file_store import RawFileStore
|
|
28
|
+
from ..utils.base_object import BaseObject
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger("topos.ingestion.manager")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _owner_user_id_from_dataset_id(dataset_id: Optional[str]) -> Optional[str]:
|
|
34
|
+
raw = str(dataset_id or "").strip()
|
|
35
|
+
if not raw or ":" not in raw:
|
|
36
|
+
return None
|
|
37
|
+
owner = raw.split(":", 1)[0].strip()
|
|
38
|
+
return owner or None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _control_plane_base_url(raw_url: Optional[str]) -> str:
|
|
42
|
+
value = str(raw_url or "").strip()
|
|
43
|
+
if value.startswith("wss://"):
|
|
44
|
+
return value.replace("wss://", "https://").split("/ws/")[0]
|
|
45
|
+
if value.startswith("ws://"):
|
|
46
|
+
return value.replace("ws://", "http://").split("/ws/")[0]
|
|
47
|
+
return value.rstrip("/")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _filter_unenriched_messages(
    canonical_messages: List[Dict[str, Any]],
    job_names: List[str],
    tables_manager: DerivedTablesManager,
    *,
    source_id: Optional[str] = None,
    dataset_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Filter out messages that have already been enriched.

    A message counts as enriched when its ``message_id`` is present in the
    derived table of at least one of ``job_names``.

    Args:
        canonical_messages: List of canonical message dictionaries
        job_names: List of enrichment job names to check
        tables_manager: DerivedTablesManager instance for database access
        source_id: When set, a derived row only counts if the matching
            ``ai_chat_messages`` row has this ``source_id``.
        dataset_id: When set together with ``source_id`` (and the
            ``ai_chat_conversations`` table exists), the check is further
            restricted to conversations owned by the dataset owner, i.e. the
            text before the first ``:`` (or the whole value if there is none).

    Returns:
        List of messages that haven't been enriched yet. The input list is
        returned unchanged when there is nothing to check or no connection.
    """
    if not canonical_messages or not job_names:
        return canonical_messages

    conn = tables_manager.conn
    if not conn:
        # No database connection, can't check - return all messages
        logger.debug("[PIPELINE:ENRICHMENT] No database connection, processing all messages")
        return canonical_messages

    def normalized_id(msg: Dict[str, Any]) -> str:
        # Single normalisation used for both the lookup and the final filter,
        # so int or whitespace-padded ids compare equal to the stored text.
        return str(msg.get("message_id") or "").strip()

    def table_exists(name: str) -> bool:
        row = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (name,),
        ).fetchone()
        return row is not None

    candidate_ids = sorted({mid for mid in map(normalized_id, canonical_messages) if mid})
    if not candidate_ids:
        return canonical_messages

    owner_user_id = ""
    if dataset_id:
        owner_user_id = str(dataset_id).split(":", 1)[0].strip()

    # Create mapping from job name to table name
    job_to_table = {job.get_job_name(): job.get_derived_table() for job in CANONICAL_JOBS}

    # Keep each statement well below SQLite's bound-parameter limit (999 on
    # older builds); an oversized IN list would otherwise fail and make every
    # message look unenriched.
    id_batch_size = 500

    # Scope checks by source_id and (when available) dataset owner so one source/user
    # does not suppress enrichment for another when message_id collides.
    enriched_message_ids: set[str] = set()
    for job_name in job_names:
        table_name = job_to_table.get(job_name)
        if not table_name:
            logger.warning("[PIPELINE:ENRICHMENT] Unknown job name: %s, skipping check", job_name)
            continue

        try:
            if not table_exists(table_name):
                # Table doesn't exist yet, no messages are enriched
                continue

            # table_name comes from the job registry, not from user input, so
            # it is interpolated; every value is bound as a parameter.
            if source_id and owner_user_id and table_exists("ai_chat_conversations"):
                select_sql = f"""
                    SELECT DISTINCT d.message_id
                    FROM {table_name} d
                    INNER JOIN ai_chat_messages m ON m.message_id = d.message_id
                    INNER JOIN ai_chat_conversations c ON c.conversation_id = m.conversation_id
                    WHERE m.source_id = ? AND c.owner_user_id = ? AND d.message_id IN
                """
                scope_params: List[Any] = [source_id, owner_user_id]
            elif source_id:
                select_sql = f"""
                    SELECT DISTINCT d.message_id
                    FROM {table_name} d
                    INNER JOIN ai_chat_messages m ON m.message_id = d.message_id
                    WHERE m.source_id = ? AND d.message_id IN
                """
                scope_params = [source_id]
            else:
                select_sql = f"SELECT DISTINCT message_id FROM {table_name} WHERE message_id IN"
                scope_params = []

            found_for_table: set[str] = set()
            for start in range(0, len(candidate_ids), id_batch_size):
                batch = candidate_ids[start : start + id_batch_size]
                placeholders = ",".join("?" for _ in batch)
                cursor = conn.execute(
                    f"{select_sql} ({placeholders})",
                    (*scope_params, *batch),
                )
                found_for_table.update(str(row[0]) for row in cursor.fetchall() if row and row[0])
        except Exception as e:
            logger.warning(
                "[PIPELINE:ENRICHMENT] Failed to check enriched messages in %s: %s",
                table_name,
                e,
            )
            # On error, assume no messages are enriched (safer to process than skip)
            continue
        else:
            # Merge only after every batch for this table succeeded.
            enriched_message_ids.update(found_for_table)

    # Filter to only messages that haven't been enriched
    unenriched = [
        msg for msg in canonical_messages
        if normalized_id(msg) not in enriched_message_ids
    ]

    if len(unenriched) < len(canonical_messages):
        logger.debug(
            "[PIPELINE:ENRICHMENT] Filtered %d already-enriched messages, %d new messages to process",
            len(canonical_messages) - len(unenriched),
            len(unenriched),
        )

    return unenriched
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
async def _read_file_bytes(file_path: Path) -> AsyncIterator[bytes]:
|
|
187
|
+
def read_all() -> bytes:
|
|
188
|
+
return file_path.read_bytes()
|
|
189
|
+
|
|
190
|
+
file_data = await asyncio.to_thread(read_all)
|
|
191
|
+
chunk_size = 8192
|
|
192
|
+
for i in range(0, len(file_data), chunk_size):
|
|
193
|
+
yield file_data[i : i + chunk_size]
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
_SQL_IDENTIFIER_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _is_valid_sql_identifier(value: str) -> bool:
|
|
200
|
+
return bool(_SQL_IDENTIFIER_RE.match(value or ""))
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _sql_type_for_source_column(column_type: str) -> str:
|
|
204
|
+
ctype = str(column_type or "").strip().lower()
|
|
205
|
+
if ctype in {"identifier", "text"}:
|
|
206
|
+
return "TEXT"
|
|
207
|
+
if ctype in {"real", "float", "number"}:
|
|
208
|
+
return "REAL"
|
|
209
|
+
if ctype in {"integer", "int"}:
|
|
210
|
+
return "INTEGER"
|
|
211
|
+
if ctype in {"json"}:
|
|
212
|
+
return "TEXT"
|
|
213
|
+
return "TEXT"
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _coerce_table_value(value: Any, *, declared_type: str) -> Any:
|
|
217
|
+
ctype = str(declared_type or "").strip().lower()
|
|
218
|
+
if value is None:
|
|
219
|
+
return None
|
|
220
|
+
if ctype == "json":
|
|
221
|
+
if isinstance(value, str):
|
|
222
|
+
return value
|
|
223
|
+
return json.dumps(value, ensure_ascii=True)
|
|
224
|
+
if isinstance(value, (dict, list)):
|
|
225
|
+
return json.dumps(value, ensure_ascii=True)
|
|
226
|
+
return value
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _tokenize_path(path: str) -> List[str]:
|
|
230
|
+
return [part.strip() for part in str(path).split(".") if part.strip()]
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _walk_path_step(nodes: List[Any], token: str) -> List[Any]:
|
|
234
|
+
out: List[Any] = []
|
|
235
|
+
if token == "*":
|
|
236
|
+
for node in nodes:
|
|
237
|
+
if isinstance(node, dict):
|
|
238
|
+
out.extend(node.values())
|
|
239
|
+
elif isinstance(node, list):
|
|
240
|
+
out.extend(node)
|
|
241
|
+
return out
|
|
242
|
+
|
|
243
|
+
list_mode = token.endswith("[*]")
|
|
244
|
+
key = token[:-3] if list_mode else token
|
|
245
|
+
for node in nodes:
|
|
246
|
+
if not isinstance(node, dict):
|
|
247
|
+
continue
|
|
248
|
+
if key not in node:
|
|
249
|
+
continue
|
|
250
|
+
value = node.get(key)
|
|
251
|
+
if list_mode:
|
|
252
|
+
if isinstance(value, list):
|
|
253
|
+
out.extend(value)
|
|
254
|
+
elif value is not None:
|
|
255
|
+
out.append(value)
|
|
256
|
+
else:
|
|
257
|
+
out.append(value)
|
|
258
|
+
return out
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _extract_path_value(payload: Dict[str, Any], path: str) -> Any:
|
|
262
|
+
if not path:
|
|
263
|
+
return payload
|
|
264
|
+
nodes: List[Any] = [payload]
|
|
265
|
+
for token in _tokenize_path(path):
|
|
266
|
+
nodes = _walk_path_step(nodes, token)
|
|
267
|
+
if not nodes:
|
|
268
|
+
return None
|
|
269
|
+
if len(nodes) == 1:
|
|
270
|
+
return nodes[0]
|
|
271
|
+
return nodes
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _expand_file_records(raw_payload: Dict[str, Any], source_def: Optional[Any]) -> List[Dict[str, Any]]:
|
|
275
|
+
if not isinstance(raw_payload, dict):
|
|
276
|
+
return []
|
|
277
|
+
ingest_shape = getattr(source_def, "file_ingest_shape", None) if source_def else None
|
|
278
|
+
if not isinstance(ingest_shape, dict):
|
|
279
|
+
return [raw_payload]
|
|
280
|
+
record_path = str(ingest_shape.get("raw_record_path") or "").strip()
|
|
281
|
+
if not record_path:
|
|
282
|
+
return [raw_payload]
|
|
283
|
+
extracted = _extract_path_value(raw_payload, record_path)
|
|
284
|
+
if isinstance(extracted, list):
|
|
285
|
+
return [item for item in extracted if isinstance(item, dict)]
|
|
286
|
+
if isinstance(extracted, dict):
|
|
287
|
+
return [extracted]
|
|
288
|
+
return []
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _persist_source_data_tables(
    *,
    db_conn: Any,
    source_def: Optional[Any],
    dataset_id: str,
    normalized_records: List[Any],
) -> None:
    """Persist normalized records into the source's declared data tables.

    When ``settings.topos_database_mode`` is ``"postgres"`` the rows are
    written through a connection obtained from ``connect_postgres()`` and
    ``db_conn`` is not used; otherwise they are written through ``db_conn``.
    """
    shared_kwargs = {
        "source_def": source_def,
        "dataset_id": dataset_id,
        "normalized_records": normalized_records,
    }

    if settings.topos_database_mode != "postgres":
        _persist_source_data_tables_on_connection(db_conn=db_conn, **shared_kwargs)
        return

    # Hosted mode should persist source tables in Postgres so rows survive engine restarts.
    with connect_postgres() as hosted_conn:
        _persist_source_data_tables_on_connection(db_conn=hosted_conn, **shared_kwargs)
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _build_source_table_upsert_sql(
    *,
    table_id: str,
    column_names: List[str],
    pk_cols: List[str],
    is_sqlite: bool,
) -> str:
    """Build the insert/upsert statement for one source data table."""
    quoted_columns = ", ".join(f'"{name}"' for name in column_names)
    placeholders = ", ".join(["?" if is_sqlite else "%s"] * len(column_names))
    if is_sqlite:
        return f'INSERT OR REPLACE INTO "{table_id}" ({quoted_columns}) VALUES ({placeholders})'

    insert_sql = f'INSERT INTO "{table_id}" ({quoted_columns}) VALUES ({placeholders})'
    conflict_cols = [name for name in pk_cols if name in column_names]
    if not conflict_cols:
        return insert_sql

    conflict_sql = ", ".join(f'"{name}"' for name in conflict_cols)
    non_pk_cols = [name for name in column_names if name not in conflict_cols]
    if non_pk_cols:
        update_sql = ", ".join(f'"{name}" = EXCLUDED."{name}"' for name in non_pk_cols)
        return f'{insert_sql} ON CONFLICT ({conflict_sql}) DO UPDATE SET {update_sql}'
    return f'{insert_sql} ON CONFLICT ({conflict_sql}) DO NOTHING'


def _existing_source_table_columns(db_conn: Any, table_id: str, *, is_sqlite: bool) -> set:
    """Return the column names currently present on ``table_id`` (empty set on failure)."""
    try:
        if is_sqlite:
            rows = db_conn.execute(f'PRAGMA table_info("{table_id}")').fetchall()
            # PRAGMA table_info rows carry the column name at index 1.
            return {str(row["name"]) if isinstance(row, dict) else str(row[1]) for row in rows}
        rows = db_conn.execute(
            """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_schema='public' AND table_name=%s
            """,
            (table_id,),
        ).fetchall()
        return {str(row[0]) for row in rows}
    except Exception as exc:
        # Same fallback as before (treat the table as having no known
        # columns), but leave a trace instead of failing silently.
        logger.warning(
            "[PIPELINE:DATA_TABLE] Could not inspect columns of table_id=%r: %s",
            table_id,
            exc,
        )
        return set()


def _persist_source_data_tables_on_connection(
    *,
    db_conn: Any,
    source_def: Optional[Any],
    dataset_id: str,
    normalized_records: List[Any],
) -> None:
    """Create the source's declared tables on ``db_conn`` and upsert records.

    Nothing happens unless ``source_def.pipeline_include_data_table`` is
    truthy, ``source_def.tables`` is a non-empty list and there are records.
    For every table entry with a valid ``table_id`` and a non-empty
    ``columns`` list:

    * columns with invalid names are dropped and the scope columns
      ``dataset_id``, ``owner_user_id`` and ``tenant_id`` are appended when
      not declared;
    * the table is created if missing and scope columns absent from an
      existing table are added with ``ALTER TABLE``;
    * each record's ``payload`` dict is written as one row, using
      ``INSERT OR REPLACE`` on SQLite and ``INSERT ... ON CONFLICT`` otherwise.

    Args:
        db_conn: Connection exposing ``execute`` and ``commit``. It is treated
            as SQLite when its class module name contains ``"sqlite"``;
            otherwise ``%s`` placeholders and ``information_schema`` are used.
        source_def: Source definition object; ``None`` is a no-op.
        dataset_id: Colon-separated id. The first non-empty part is used as
            the owner and the third as the tenant for rows that do not carry
            those values themselves.
        normalized_records: Objects whose ``payload`` attribute is a dict;
            anything else is skipped.
    """
    if not db_conn or not source_def:
        return
    if not bool(getattr(source_def, "pipeline_include_data_table", False)):
        return
    tables = getattr(source_def, "tables", None)
    if not isinstance(tables, list) or not tables:
        return
    if not normalized_records:
        return

    owner_user_id: Optional[str] = None
    tenant_id: Optional[str] = None
    dataset_parts = [part for part in str(dataset_id or "").split(":") if part]
    if dataset_parts:
        owner_user_id = dataset_parts[0]
    if len(dataset_parts) >= 3:
        tenant_id = dataset_parts[2]

    # Values used when a record's payload does not provide the scope column.
    scope_defaults: Dict[str, Any] = {
        "dataset_id": dataset_id,
        "owner_user_id": owner_user_id,
        "tenant_id": tenant_id,
    }
    pooled_scope_columns: List[Dict[str, Any]] = [
        {"name": "dataset_id", "type": "text"},
        {"name": "owner_user_id", "type": "text"},
        {"name": "tenant_id", "type": "text"},
    ]

    # The connection flavour does not change between tables.
    is_sqlite = "sqlite" in db_conn.__class__.__module__.lower()

    for table in tables:
        if not isinstance(table, dict):
            continue
        table_id = str(table.get("table_id") or "").strip()
        columns = table.get("columns")
        if not table_id or not _is_valid_sql_identifier(table_id):
            logger.warning("[PIPELINE:DATA_TABLE] Skipping invalid table_id=%r", table_id)
            continue
        if not isinstance(columns, list) or not columns:
            continue

        # Names are validated because they are interpolated into DDL/DML.
        valid_columns: List[Dict[str, Any]] = []
        for column in columns:
            if not isinstance(column, dict):
                continue
            col_name = str(column.get("name") or "").strip()
            if not col_name or not _is_valid_sql_identifier(col_name):
                continue
            valid_columns.append(column)
        existing_names = {str(col.get("name") or "").strip() for col in valid_columns}
        for pooled_col in pooled_scope_columns:
            pooled_name = str(pooled_col["name"])
            if pooled_name in existing_names:
                continue
            valid_columns.append(dict(pooled_col))
            existing_names.add(pooled_name)
        # NOTE: valid_columns can never be empty here because the scope
        # columns are always present, so a table whose declared columns are
        # all invalid is still created with just the scope columns.

        column_names = [str(column.get("name")).strip() for column in valid_columns]
        declared_types = [column.get("type") for column in valid_columns]

        defs: List[str] = []
        pk_cols: List[str] = []
        for column, col_name, declared in zip(valid_columns, column_names, declared_types):
            col_type = _sql_type_for_source_column(str(declared or "text"))
            defs.append(f'"{col_name}" {col_type}')
            if bool(column.get("primary_key")):
                pk_cols.append(col_name)
        if pk_cols:
            pk_sql = ", ".join([f'"{name}"' for name in pk_cols])
            defs.append(f"PRIMARY KEY ({pk_sql})")

        db_conn.execute(f'CREATE TABLE IF NOT EXISTS "{table_id}" ({", ".join(defs)})')

        # Tables created before the scope columns existed need them added.
        persisted_columns = _existing_source_table_columns(db_conn, table_id, is_sqlite=is_sqlite)
        for pooled_name in ("dataset_id", "owner_user_id", "tenant_id"):
            if pooled_name in persisted_columns:
                continue
            db_conn.execute(f'ALTER TABLE "{table_id}" ADD COLUMN "{pooled_name}" TEXT')
            persisted_columns.add(pooled_name)

        sql = _build_source_table_upsert_sql(
            table_id=table_id,
            column_names=column_names,
            pk_cols=pk_cols,
            is_sqlite=is_sqlite,
        )
        coercion_types = [str(declared or "") for declared in declared_types]

        for normalized in normalized_records:
            payload = getattr(normalized, "payload", {})
            if not isinstance(payload, dict):
                continue
            row_values: List[Any] = []
            for col_name, declared_type in zip(column_names, coercion_types):
                raw_value = payload.get(col_name)
                if raw_value is None:
                    if col_name in scope_defaults:
                        raw_value = scope_defaults[col_name]
                    elif col_name == "record_id":
                        # Fall back to the payload's own identifier fields.
                        raw_value = payload.get("id") or payload.get("message_id")
                row_values.append(_coerce_table_value(raw_value, declared_type=declared_type))
            db_conn.execute(sql, tuple(row_values))

    db_conn.commit()
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
async def _try_install_runtime_source_definition_from_control_plane(
    *,
    source_id: Optional[str],
    schema_id: str,
    user_id: Optional[str],
    dataset_id: str,
    progress_api_url: Optional[str],
    progress_api_key: Optional[str],
) -> Optional[Any]:
    """Best-effort source install when runtime registry is stale.

    Fetches ``{control-plane}/sources`` for the given user and dataset,
    installs the first returned row that matches ``source_id`` and
    ``schema_id`` (each only when non-blank) and is then found in
    ``REGISTRY``, and returns that registry entry.

    Returns ``None`` when the control-plane URL, bearer token, user id or
    dataset id is missing, when the response has no ``sources`` list, when no
    row ends up registered, or when any error occurs (logged as a warning).
    """
    cp_base = _control_plane_base_url(progress_api_url or settings.topos_control_plane_url)
    if not cp_base:
        return None

    token = str(progress_api_key or settings.topos_key or "").strip()
    if not token:
        return None

    query_user = str(user_id or "").strip()
    query_dataset = str(dataset_id or "").strip()
    if not (query_user and query_dataset):
        return None
    params = {"user_id": query_user, "dataset_id": query_dataset}

    try:
        # Imported lazily inside the guarded region so an import failure is
        # handled like any other error.
        import httpx
        from ..sources.runtime_install import install_source_definition

        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(
                f"{cp_base}/sources",
                params=params,
                headers={"Authorization": f"Bearer {token}"},
            )
        resp.raise_for_status()

        payload = resp.json() if resp.content else {}
        rows = payload.get("sources") if isinstance(payload, dict) else None
        if not isinstance(rows, list):
            return None

        wanted_source_id = str(source_id or "").strip()
        wanted_schema = str(schema_id or "").strip()
        for row in rows:
            if not isinstance(row, dict):
                continue
            row_source_id = str(row.get("source_id") or "").strip()
            row_schema_id = str(row.get("schema_id") or "").strip()
            source_mismatch = bool(wanted_source_id) and row_source_id != wanted_source_id
            schema_mismatch = bool(wanted_schema) and row_schema_id != wanted_schema
            if source_mismatch or schema_mismatch:
                continue

            install_source_definition(row)
            installed = REGISTRY.get(row_source_id)
            if not installed:
                # The row did not end up in the registry; try the next one.
                continue
            logger.info(
                "[PIPELINE:MANAGER] Installed runtime source definition from control-plane: source_id=%s schema_id=%s",
                row_source_id,
                row_schema_id,
            )
            return installed
    except Exception as exc:
        logger.warning(
            "[PIPELINE:MANAGER] Failed to install runtime source definition from control-plane (source_id=%s schema_id=%s): %s",
            source_id,
            schema_id,
            exc,
        )
    return None
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
@dataclass
|
|
536
|
+
class IngestionManager(BaseObject):
|
|
537
|
+
file_store: RawFileStore
|
|
538
|
+
checkpoint_store: Optional[CheckpointStore] = None
|
|
539
|
+
|
|
540
|
+
def __post_init__(self):
    """Finish BaseObject setup, which the dataclass-generated __init__ skips."""
    if not hasattr(self, "_name"):
        # No name assigned yet: derive "<ClassName>#<n>" from the per-class counter.
        from ..utils.base_object import _next_instance_number

        sequence = _next_instance_number(self.__class__)
        generated_name = f"{self.__class__.__name__}#{sequence}"
        object.__setattr__(self, "_name", generated_name)

    # Run the parent initializer explicitly with whatever name is now set.
    BaseObject.__init__(self, name=getattr(self, "_name", None))
|
|
549
|
+
|
|
550
|
+
async def process_job(
    self,
    job: IngestionJob,
    source_id: Optional[str] = None,
    progress_api_url: Optional[str] = None,
    progress_api_key: Optional[str] = None,
) -> Dict[str, Any]:
    """Run one file ingestion job end to end and return a summary.

    Steps, in order: locate the raw file, parse and validate every record,
    persist source-defined tables, canonicalize, run enrichment when the
    source's trigger is "automatic", save a checkpoint, emit a usage
    observation.

    Args:
        job: The job to process; ``dataset_id``, ``schema_id``, ``job_id``
            and ``metadata`` are read from it.
        source_id: Preferred source definition. When absent or unknown the
            registry is searched by ``job.schema_id``, then the control
            plane is asked as a last resort.
        progress_api_url: Base URL that progress updates are POSTed to.
            Updates are skipped unless both this and the key are set.
        progress_api_key: Bearer token for progress updates.

    Returns:
        A dict with ``job_id``, ``records_processed``, ``errors_count``,
        ``errors`` (first 100 only), ``progress_percent``,
        ``records_total``, ``estimated_seconds_remaining`` and
        ``current_step``.

    Raises:
        FileNotFoundError: The raw file for the job does not exist.
        ValueError: No parser is registered for ``job.schema_id``.
    """
    file_path = self.file_store.get_file_path(job.dataset_id, job.schema_id)
    if not file_path.exists():
        raise FileNotFoundError(f"Raw file not found: {file_path}")

    parser_cls = PARSER_REGISTRY.get(job.schema_id)
    if not parser_cls:
        raise ValueError(f"No parser registered for schema: {job.schema_id}")

    logger.debug(
        "[PIPELINE:MANAGER] %s: Starting job processing: job_id=%s, dataset_id=%s, schema_id=%s, source_id=%s, file_path=%s",
        self,
        job.job_id,
        job.dataset_id,
        job.schema_id,
        source_id,
        file_path,
    )
    # Instantiate parser with schema_id (for v2 support)
    parser = parser_cls(dataset_id=job.dataset_id, _schema_id=job.schema_id)

    # Resolve the file format once. Previously the counting step below read an
    # undefined name, so the swallowed NameError left records_total at None.
    file_format = (job.metadata or {}).get("file_format", "jsonl")

    # Try to count total records for progress tracking (optional, may be None)
    records_total = None
    try:
        # Count lines in file (approximation for JSONL)
        if file_format == "jsonl":
            with open(file_path, "rb") as f:
                records_total = sum(1 for _ in f)
    except Exception as exc:
        # Best effort only: without a count the progress bar estimates its total.
        logger.debug("[PIPELINE:MANAGER] %s: Could not count records in %s: %s", self, file_path, exc)

    progress = IngestionProgress(job_id=job.job_id, records_total=records_total)
    progress_context = {
        "user_id": _owner_user_id_from_dataset_id(job.dataset_id),
        "dataset_id": job.dataset_id,
    }

    async def _post_progress(failure_message: str, *, snapshot: bool = True, **overrides: Any) -> None:
        """POST one progress update; failures are logged, never raised.

        ``overrides`` are applied after the progress snapshot so explicit
        values such as ``current_step`` are not overwritten by it.
        """
        if not (progress_api_url and progress_api_key):
            return
        try:
            import httpx

            body: Dict[str, Any] = {"job_id": job.job_id, **progress_context}
            if snapshot:
                body.update(progress.to_dict())
            body.update(overrides)
            async with httpx.AsyncClient(timeout=10.0) as client:
                await client.post(
                    f"{progress_api_url}/v1/ingestion/progress",
                    json=body,
                    headers={"Authorization": f"Bearer {progress_api_key}"},
                )
        except Exception as exc:
            logger.warning(failure_message, exc)

    # Send initial progress update
    await _post_progress(
        "Failed to send initial ingestion progress: %s",
        snapshot=False,
        status="processing",
        progress_percent=0.0,
        records_processed=0,
        records_total=records_total,
        current_step="starting",
    )

    # Find source definition: use source_id if provided, otherwise find by schema_id
    source_def = None
    if source_id:
        source_def = REGISTRY.get(source_id)
        if source_def:
            logger.info(
                "[PIPELINE:MANAGER] %s: Using source from source_id=%s: %s (enrichment_trigger=%s)",
                self,
                source_id,
                source_def.display_name,
                getattr(source_def, "enrichment_trigger", "not_set"),
            )
        else:
            logger.warning(
                "[PIPELINE:MANAGER] %s: source_id=%s not found in registry, falling back to schema_id lookup",
                self,
                source_id,
            )

    if not source_def:
        # Fallback: find by schema_id (prefer file type for file ingestion)
        for source in REGISTRY.values():
            if source.schema_id != job.schema_id:
                continue
            if source.source_type == "file":
                source_def = source
                logger.debug(
                    "[PIPELINE:MANAGER] %s: Found file source by schema_id: source_id=%s",
                    self,
                    source.source_id,
                )
                break
            if not source_def:
                # Keep first match as fallback
                source_def = source
        if source_def:
            logger.info(
                "[PIPELINE:MANAGER] %s: Found source by schema_id: source_id=%s, source_type=%s, enrichment_trigger=%s",
                self,
                source_def.source_id,
                source_def.source_type,
                getattr(source_def, "enrichment_trigger", "not_set"),
            )
        else:
            # Registry may be stale: ask the control plane for the definition.
            source_def = await _try_install_runtime_source_definition_from_control_plane(
                source_id=source_id,
                schema_id=job.schema_id,
                user_id=_owner_user_id_from_dataset_id(job.dataset_id),
                dataset_id=job.dataset_id,
                progress_api_url=progress_api_url,
                progress_api_key=progress_api_key,
            )

    # Get canonical mapper if available (used only as the ImportError fallback below)
    canonical_mapper = None
    if source_def and source_def.canonical_mapper_id:
        mapper_cls = MAPPER_REGISTRY.get(source_def.canonical_mapper_id)
        if mapper_cls:
            canonical_mapper = mapper_cls()

    # Initialize enrichment orchestrator with a real connection, even outside app startup.
    from ..core.state import get_db_connection

    db_conn = get_db_connection()
    tables_manager = DerivedTablesManager(conn=db_conn) if db_conn else None
    enrichment_orchestrator = EnrichmentOrchestrator(tables_manager=tables_manager) if tables_manager else None

    records_processed = 0
    errors: list[dict] = []
    last_record_id: Optional[str] = None
    normalized_records: List[Any] = []

    # Use TUI progress bar for better terminal display (single-line updates).
    # Without a known total, start from a placeholder and grow it while parsing.
    pbar = ProgressBar(total=records_total if records_total else 1000, desc=f"{self}: Parsing")

    async for raw_payload in parse_file(_read_file_bytes(file_path), file_format):
        expanded_payloads = _expand_file_records(raw_payload, source_def)
        if not expanded_payloads:
            expanded_payloads = [raw_payload] if isinstance(raw_payload, dict) else []
        for record_payload in expanded_payloads:
            # First usable id wins. None is skipped explicitly because
            # str(None) is the truthy string "None" and would otherwise
            # mask both the message_id and the positional fallback.
            record_id = next(
                (
                    str(candidate)
                    for candidate in (record_payload.get("id"), record_payload.get("message_id"))
                    if candidate is not None and str(candidate)
                ),
                f"{records_processed + 1}",
            )
            raw_content = record_payload.get("content")
            if isinstance(raw_content, str):
                content_preview = raw_content[:100]
            else:
                content_preview = str(raw_content)[:100]
            logger.debug(
                "[PIPELINE:MANAGER] %s: Processing raw record: record_id=%s, content_preview=%s",
                self,
                record_id,
                content_preview,
            )
            raw_record = RawRecord(record_id=record_id, payload=record_payload)
            validation = parser.validate(raw_record)
            if not validation.is_valid:
                logger.debug(
                    "[PIPELINE:MANAGER] %s: Validation failed: record_id=%s, errors=%s",
                    self,
                    record_id,
                    validation.errors,
                )
                errors.append({"record_id": record_id, "errors": validation.errors})
                if pbar:
                    pbar.update(1)  # Still count invalid records
                continue
            normalized = parser.parse(raw_record)
            logger.debug(
                "[PIPELINE:NORMALIZED] Record normalized: record_id=%s, fields=%s",
                normalized.record_id,
                sorted(list(normalized.payload.keys()))[:12],
            )
            normalized_records.append(normalized)
            records_processed += 1
            last_record_id = record_id
            progress.update(records_processed, current_step="parsing")

            # Update progress bar
            if pbar:
                # If we didn't know total initially, update it now
                if not records_total and records_processed > pbar.total:
                    # Estimate: assume we're at least 10% done, so total is at least 10x current
                    pbar.total = max(pbar.total, records_processed * 10)
                pbar.update(1)

            # Report progress to control plane if configured
            if progress.should_report() and progress_api_url and progress_api_key:
                await _post_progress("Failed to send ingestion progress update: %s")

            if progress.should_report():
                logger.debug("Ingestion progress: %s", progress.to_dict())

    # Update progress bar: parsing complete, move to canonicalization
    if pbar:
        # Update total if we now know it
        if not records_total and records_processed > 0:
            pbar.total = records_processed
            pbar.n = records_processed  # Set current to match
        pbar.set_description(f"{self}: Canonicalizing")
        pbar._display()  # Force display update

    # Update progress: parsing complete
    await _post_progress(
        "Failed to send parsing complete progress: %s",
        current_step="canonicalizing",
    )

    # Persist parser output into source-defined logical tables when configured.
    if source_def and db_conn and normalized_records:
        try:
            _persist_source_data_tables(
                db_conn=db_conn,
                source_def=source_def,
                dataset_id=job.dataset_id,
                normalized_records=normalized_records,
            )
        except Exception as exc:
            logger.error(
                "[PIPELINE:DATA_TABLE] %s: Failed to persist source table rows: %s",
                self,
                exc,
                exc_info=True,
            )
            errors.append({"step": "source_data_table", "errors": [str(exc)]})

    # Canonicalize normalized records: conversations group -> conversation_messages; else engine ai_chat_*
    canonical_messages: List[Dict[str, Any]] = []
    if source_def and normalized_records:
        # Build staging records once (same shape for both paths)
        staging_records = []
        for normalized in normalized_records:
            staging_record = {
                "message_id": normalized.payload.get("message_id"),
                "dataset_id": job.dataset_id,
                "thread_id": normalized.payload.get("thread_id") or normalized.payload.get("conversation_id") or job.dataset_id,
                "ts": normalized.payload.get("ts") or normalized.payload.get("created_at") or str(datetime.now(timezone.utc).timestamp()),
                "sender_type": normalized.payload.get("sender_type"),
                "content": normalized.payload.get("content"),
                "source_id": source_def.source_id,
            }
            if "_metadata" in normalized.payload:
                staging_record["_metadata"] = normalized.payload["_metadata"]
            staging_records.append(staging_record)

        canonical_group_id = getattr(source_def, "canonical_group_id", None)
        if canonical_group_id == "conversations":
            # Conversations canonical: write only to conversation_messages / conversations (never ai_chat_*)
            import json as _json

            from ..storage.canonical import ConversationsTablesManager

            db_conn = get_db_connection()
            if db_conn:
                conv_manager = ConversationsTablesManager(db_conn)
                canonical_result = conv_manager.upsert_message_batch(
                    staging_records, job.dataset_id, source_def.source_id
                )
                logger.debug(
                    "[PIPELINE:CANONICAL] %s: Conversations canonical: messages_created=%s, conversations_created=%s",
                    self,
                    canonical_result.get("messages_created", 0),
                    canonical_result.get("conversations_created", 0),
                )
                for staging_record in staging_records:
                    metadata_json = None
                    if "_metadata" in staging_record:
                        metadata_json = _json.dumps(staging_record["_metadata"])
                    canonical_messages.append({
                        "message_id": staging_record.get("message_id"),
                        "conversation_id": staging_record.get("thread_id") or staging_record.get("conversation_id") or job.dataset_id,
                        "sender_type": staging_record.get("sender_type"),
                        "sender_id": None,
                        "ts": staging_record.get("ts"),
                        "content": staging_record.get("content"),
                        "content_rendered": None,
                        "metadata_json": metadata_json,
                        "seq": 0,
                        "source_id": source_def.source_id,
                    })
        elif source_def.canonical_mapper_id:
            # Engine path: ai_chat_messages / ai_chat_conversations
            try:
                from ..storage.canonical.ai_chat import CanonicalTablesManager, Canonicalizer

                db_conn = get_db_connection()
                canonical_tables_manager = CanonicalTablesManager(db_conn) if db_conn else None
                if canonical_tables_manager:
                    canonicalizer = Canonicalizer(canonical_tables_manager)
                    mapper_source = source_def.canonical_mapper_id
                    logger.debug(
                        "[PIPELINE:CANONICAL] %s: Canonicalizing %d records with mapper=%s (source_id=%s)",
                        self,
                        len(staging_records),
                        mapper_source,
                        source_def.source_id,
                    )
                    canonical_result = canonicalizer.canonicalize_staging_batch(
                        staging_records, source=mapper_source, batch_size=1000
                    )
                    # Enrichment should consume canonicalized rows (not pre-mapper staging rows).
                    mapped_messages = canonical_result.get("canonical_messages")
                    if isinstance(mapped_messages, list):
                        canonical_messages.extend(
                            [msg for msg in mapped_messages if isinstance(msg, dict)]
                        )
                    logger.debug(
                        "[PIPELINE:CANONICAL] %s: Canonicalization complete: messages_created=%s, conversations_created=%s, canonical_messages_count=%s",
                        self,
                        canonical_result.get("messages_created", 0),
                        canonical_result.get("conversations_created", 0),
                        len(canonical_messages),
                    )
                else:
                    logger.warning("[PIPELINE:CANONICAL] %s: No database connection, skipping canonicalization", self)
            except ImportError as e:
                logger.warning("[PIPELINE:CANONICAL] %s: Canonicalization modules not available: %s. Using fallback mapper.", self, e)
                if canonical_mapper:
                    for normalized in normalized_records:
                        try:
                            canonical = canonical_mapper.map(normalized)
                            if source_def:
                                canonical.payload["source_id"] = source_def.source_id
                            canonical_messages.append(canonical.payload)
                        except Exception as exc:
                            logger.error("[PIPELINE:CANONICAL] %s: Failed to canonicalize record %s: %s", self, normalized.record_id, exc)
                            errors.append({"record_id": normalized.record_id, "errors": [str(exc)]})
            except Exception as exc:
                logger.error("[PIPELINE:CANONICAL] %s: Failed to canonicalize records: %s", self, exc, exc_info=True)
                errors.append({"step": "canonicalization", "errors": [str(exc)]})

    # Run enrichment on canonical messages (only if automatic trigger)
    if canonical_messages and source_def and source_def.canonical_enrichment_jobs:
        # Get enrichment trigger - explicitly check attribute, default to "automatic" if not set
        enrichment_trigger = getattr(source_def, "enrichment_trigger", "automatic")

        logger.info(
            "[PIPELINE:ENRICHMENT] %s: Enrichment trigger check: source_id=%s, enrichment_trigger=%s, canonical_messages=%d, jobs=%s",
            self,
            source_def.source_id if source_def else "unknown",
            enrichment_trigger,
            len(canonical_messages),
            source_def.canonical_enrichment_jobs,
        )

        # Any trigger value other than "manual" or "automatic" runs nothing.
        if enrichment_trigger == "manual":
            logger.info(
                "[PIPELINE:ENRICHMENT] %s: ✅ SKIPPING enrichment (manual trigger): %d canonical messages will be enriched later via POST /v1/enrichment/process",
                self,
                len(canonical_messages),
            )
        elif enrichment_trigger == "automatic":
            logger.info(
                "[PIPELINE:ENRICHMENT] %s: Running enrichment (automatic trigger)",
                self,
            )
            # Filter out messages that are already enriched
            unenriched_messages = _filter_unenriched_messages(
                canonical_messages,
                source_def.canonical_enrichment_jobs,
                tables_manager,
                source_id=source_def.source_id,
                dataset_id=job.dataset_id,
            )

            if not unenriched_messages:
                logger.debug(
                    "[PIPELINE:ENRICHMENT] %s: All %d messages already enriched, skipping",
                    self,
                    len(canonical_messages),
                )
            elif not enrichment_orchestrator:
                logger.error(
                    "[PIPELINE:ENRICHMENT] %s: Cannot run enrichment - enrichment_orchestrator not initialized",
                    self,
                )
                errors.append({"step": "enrichment", "errors": ["Enrichment orchestrator not initialized"]})
            else:
                logger.info(
                    "[PIPELINE:ENRICHMENT] %s → %s: Starting enrichment (automatic): %d new messages (out of %d total), jobs=%s",
                    self,
                    enrichment_orchestrator,
                    len(unenriched_messages),
                    len(canonical_messages),
                    source_def.canonical_enrichment_jobs,
                )
                try:
                    enrichment_result = await enrichment_orchestrator.run_canonical(
                        unenriched_messages,
                        job_names=source_def.canonical_enrichment_jobs,
                    )
                    logger.info(
                        "[PIPELINE:ENRICHMENT] %s → %s: Enrichment complete: jobs_run=%s, records_created=%s, errors=%s",
                        self,
                        enrichment_orchestrator,
                        enrichment_result.get("jobs_run"),
                        enrichment_result.get("records_created"),
                        len(enrichment_result.get("errors", [])),
                    )
                    if enrichment_result.get("errors"):
                        errors.extend(enrichment_result["errors"])
                except Exception as exc:
                    logger.error(
                        "[PIPELINE:ENRICHMENT] %s → %s: Enrichment failed: %s",
                        self,
                        enrichment_orchestrator,
                        exc,
                        exc_info=True,
                    )
                    errors.append({"step": "enrichment", "errors": [str(exc)]})

    if self.checkpoint_store and last_record_id:
        checkpoint = IngestionCheckpoint(
            dataset_id=job.dataset_id,
            schema_id=job.schema_id,
            last_record_id=last_record_id,
            metadata={"file_path": str(file_path)},
        )
        self.checkpoint_store.save_checkpoint(checkpoint)

    logger.debug(
        "[PIPELINE:MANAGER] %s: Job complete: job_id=%s, records_processed=%s, errors_count=%s, last_record_id=%s",
        self,
        job.job_id,
        records_processed,
        len(errors),
        last_record_id,
    )

    # Update progress: set records_total if we now know it
    if not progress.records_total and records_processed > 0:
        progress.records_total = records_processed

    # Finalize progress bar
    if pbar:
        # Ensure progress bar shows 100%
        if records_processed > 0:
            pbar.total = records_processed
            pbar.n = records_processed
        pbar.set_description(f"{self}: Complete")
        pbar._display()
        pbar.close()

    # Send final progress update; the explicit values override the snapshot.
    await _post_progress(
        "Failed to send final ingestion progress: %s",
        status="completed",
        current_step="completed",
        progress_percent=100.0,  # Ensure 100% on completion
    )

    try:
        file_size_bytes = int(file_path.stat().st_size)
    except Exception:
        file_size_bytes = 0
    # Usage is metered in whole megabytes, rounded up.
    quantity_mb = int((max(0, file_size_bytes) + (1024 * 1024) - 1) // (1024 * 1024))
    await emit_usage_observation(
        action="ingestion.file_processed",
        quantity=quantity_mb,
        producer="ingestion.manager",
        canonical_action_identity={
            "job_id": job.job_id,
            "dataset_id": job.dataset_id,
            "schema_id": job.schema_id,
            "source_id": source_id or "",
            "records_processed": records_processed,
        },
        topos_id=job.dataset_id,
        trust_class="cp_observed_self_hosted",
        metadata={"file_size_bytes": file_size_bytes, "quantity_mb": quantity_mb},
    )

    # Include progress information in return (for progress bar)
    progress_dict = progress.to_dict()

    return {
        "job_id": job.job_id,
        "records_processed": records_processed,
        "errors_count": len(errors),
        "errors": errors[:100],
        # Include progress for progress bar
        "progress_percent": progress_dict.get("progress_percent", 0.0),
        "records_total": progress_dict.get("records_total"),
        "estimated_seconds_remaining": progress_dict.get("estimated_seconds_remaining"),
        "current_step": progress_dict.get("current_step"),
    }
|