topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,823 @@
|
|
|
1
|
+
"""Local sync ingestion: iMessage, Signal (read from local DB, write to conversation_messages)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import sqlite3
|
|
7
|
+
from datetime import datetime, timedelta, timezone
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
from .checkpoints.checkpoint_store import CheckpointStore, IngestionCheckpoint
|
|
11
|
+
from .checkpoints.sqlite_checkpoint_store import SqliteCheckpointStore
|
|
12
|
+
from .parsers import PARSER_REGISTRY
|
|
13
|
+
from .sources.base import RawRecord
|
|
14
|
+
|
|
15
|
+
# Module-level logger shared by every local-sync helper in this file.
logger = logging.getLogger("topos.ingestion.local_sync")
|
|
16
|
+
|
|
17
|
+
# Schema id used both to look up the iMessage parser in PARSER_REGISTRY and to key ingestion checkpoints.
IMESSAGE_SCHEMA_ID = "imessage.messages.v1"
|
|
18
|
+
# source_id stamped on iMessage raw records, staging rows and enrichment payloads written by this module.
SOURCE_ID_IMESSAGE = "imessage"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _run_local_sync_enrichment_if_enabled(
|
|
22
|
+
*,
|
|
23
|
+
db_conn: Any,
|
|
24
|
+
source_id: str,
|
|
25
|
+
canonical_messages: List[Dict[str, Any]],
|
|
26
|
+
) -> None:
|
|
27
|
+
"""Run canonical enrichment for local_sync sources when trigger is automatic."""
|
|
28
|
+
if not canonical_messages:
|
|
29
|
+
return
|
|
30
|
+
try:
|
|
31
|
+
from ..sources.registry import REGISTRY
|
|
32
|
+
source_def = REGISTRY.get(source_id)
|
|
33
|
+
if not source_def:
|
|
34
|
+
return
|
|
35
|
+
if getattr(source_def, "enrichment_trigger", "manual") != "automatic":
|
|
36
|
+
return
|
|
37
|
+
job_names = list(getattr(source_def, "canonical_enrichment_jobs", []) or [])
|
|
38
|
+
if not job_names:
|
|
39
|
+
return
|
|
40
|
+
from ..enrichment.derived_tables import DerivedTablesManager
|
|
41
|
+
from ..enrichment.orchestrator import EnrichmentOrchestrator
|
|
42
|
+
import asyncio as _asyncio
|
|
43
|
+
|
|
44
|
+
orchestrator = EnrichmentOrchestrator(tables_manager=DerivedTablesManager(conn=db_conn))
|
|
45
|
+
_asyncio.run(orchestrator.run_canonical(canonical_messages, job_names=job_names))
|
|
46
|
+
except Exception as e:
|
|
47
|
+
logger.warning(
|
|
48
|
+
"[PIPELINE:ENRICHMENT] local_sync enrichment failed (non-fatal): source_id=%s error=%s",
|
|
49
|
+
source_id,
|
|
50
|
+
e,
|
|
51
|
+
exc_info=True,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _resolve_sync_start_unix(options: Optional[Dict[str, Any]]) -> tuple[Optional[float], Optional[str]]:
|
|
56
|
+
"""Resolve sync start timestamp from sync options."""
|
|
57
|
+
if not options:
|
|
58
|
+
return None, None
|
|
59
|
+
mode = str(options.get("mode") or "all").strip().lower()
|
|
60
|
+
if mode in {"", "all"}:
|
|
61
|
+
return None, None
|
|
62
|
+
now = datetime.now(timezone.utc)
|
|
63
|
+
if mode == "1m":
|
|
64
|
+
return (now - timedelta(days=30)).timestamp(), None
|
|
65
|
+
if mode == "3m":
|
|
66
|
+
return (now - timedelta(days=90)).timestamp(), None
|
|
67
|
+
if mode == "6m":
|
|
68
|
+
return (now - timedelta(days=180)).timestamp(), None
|
|
69
|
+
if mode == "1y":
|
|
70
|
+
return (now - timedelta(days=365)).timestamp(), None
|
|
71
|
+
if mode == "5y":
|
|
72
|
+
return (now - timedelta(days=365 * 5)).timestamp(), None
|
|
73
|
+
if mode == "custom":
|
|
74
|
+
start_raw = options.get("start_date")
|
|
75
|
+
if not start_raw:
|
|
76
|
+
return None, "start_date is required for custom sync mode"
|
|
77
|
+
try:
|
|
78
|
+
start_text = str(start_raw).strip()
|
|
79
|
+
if len(start_text) == 10:
|
|
80
|
+
dt = datetime.fromisoformat(start_text).replace(tzinfo=timezone.utc)
|
|
81
|
+
else:
|
|
82
|
+
dt = datetime.fromisoformat(start_text.replace("Z", "+00:00"))
|
|
83
|
+
if dt.tzinfo is None:
|
|
84
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
85
|
+
else:
|
|
86
|
+
dt = dt.astimezone(timezone.utc)
|
|
87
|
+
return dt.timestamp(), None
|
|
88
|
+
except Exception:
|
|
89
|
+
return None, f"invalid start_date: {start_raw}"
|
|
90
|
+
return None, f"unknown sync mode: {mode}"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _map_normalized_records_with_canonical_mapper(
    normalized_records: List[Any],
    *,
    source_id: str,
) -> List[Dict[str, Any]]:
    """Turn normalized records into canonical payload dicts.

    The mapper class is found via the source's ``canonical_mapper_id`` in the
    source registry and ``MAPPER_REGISTRY``. Each record is passed through
    ``mapper.map`` and the resulting payload is copied and stamped with
    ``source_id``.

    If anything on that path fails (imports, missing mapper, a mapper error),
    a warning is logged and every record is mapped by the fallback instead:
    copy the record's own ``payload``, default ``message_id`` to the record's
    ``record_id`` and ``conversation_id`` to the payload's ``thread_id``, then
    stamp ``source_id``.

    Args:
        normalized_records: Parser output; records are read via ``.payload``
            and ``.record_id`` in the fallback path.
        source_id: Registry key, also written into every returned payload.

    Returns:
        One payload dict per input record, in input order.
    """
    try:
        from ..canonicalization.mappers import MAPPER_REGISTRY
        from ..sources.registry import REGISTRY

        definition = REGISTRY.get(source_id)
        mapper_id = getattr(definition, "canonical_mapper_id", None) if definition else None
        mapper_cls = MAPPER_REGISTRY.get(mapper_id) if mapper_id else None
        if not mapper_cls:
            raise ValueError(f"No canonical mapper registered for source_id={source_id} mapper_id={mapper_id}")

        mapper = mapper_cls()
        mapped: List[Dict[str, Any]] = []
        for record in normalized_records:
            payload = dict(mapper.map(record).payload or {})
            payload["source_id"] = source_id
            mapped.append(payload)
        return mapped
    except Exception as exc:
        logger.warning(
            "[PIPELINE:CANONICAL] local_sync mapper unavailable for source_id=%s, using fallback payload mapping: %s",
            source_id,
            exc,
        )
        # Fallback restarts from the first record; partial mapper output is discarded.
        fallback: List[Dict[str, Any]] = []
        for record in normalized_records:
            payload = dict(getattr(record, "payload", {}) or {})
            if not payload.get("message_id"):
                payload["message_id"] = getattr(record, "record_id", None)
            if not payload.get("conversation_id"):
                payload["conversation_id"] = payload.get("thread_id")
            payload["source_id"] = source_id
            fallback.append(payload)
        return fallback
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _signal_reply_source_key_to_seconds(source_key: Any) -> Optional[int]:
|
|
135
|
+
"""Normalize Signal reply source key variants to Unix seconds for lookup."""
|
|
136
|
+
if source_key is None:
|
|
137
|
+
return None
|
|
138
|
+
text = str(source_key).strip()
|
|
139
|
+
if not text:
|
|
140
|
+
return None
|
|
141
|
+
if text.startswith("signal:"):
|
|
142
|
+
parts = text.split(":")
|
|
143
|
+
if len(parts) >= 3:
|
|
144
|
+
try:
|
|
145
|
+
return int(float(parts[-1]))
|
|
146
|
+
except Exception:
|
|
147
|
+
return None
|
|
148
|
+
try:
|
|
149
|
+
value = int(float(text))
|
|
150
|
+
except Exception:
|
|
151
|
+
return None
|
|
152
|
+
# Common Signal quote.id style is milliseconds.
|
|
153
|
+
if abs(value) >= 1_000_000_000_000:
|
|
154
|
+
return int(value / 1000)
|
|
155
|
+
return value
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _resolve_signal_reply_links(
|
|
159
|
+
*,
|
|
160
|
+
db_conn: Any,
|
|
161
|
+
dataset_id: str,
|
|
162
|
+
staging_records: List[Dict[str, Any]],
|
|
163
|
+
) -> None:
|
|
164
|
+
"""Resolve Signal reply source keys to canonical message_id when possible.
|
|
165
|
+
|
|
166
|
+
This mutates staging_records in-place:
|
|
167
|
+
- preserves original source reply key in _metadata.reply_to_source_key
|
|
168
|
+
- updates reply_to_message_id to canonical message_id when matched
|
|
169
|
+
"""
|
|
170
|
+
if not staging_records:
|
|
171
|
+
return
|
|
172
|
+
|
|
173
|
+
# Build in-batch lookup by (conversation/thread id, sent_at_seconds) -> message_id.
|
|
174
|
+
batch_lookup: Dict[tuple[str, int], str] = {}
|
|
175
|
+
for rec in staging_records:
|
|
176
|
+
message_id = str(rec.get("message_id") or "")
|
|
177
|
+
thread_id = str(rec.get("thread_id") or rec.get("conversation_id") or "")
|
|
178
|
+
if not message_id or not thread_id:
|
|
179
|
+
continue
|
|
180
|
+
sec = _signal_reply_source_key_to_seconds(message_id)
|
|
181
|
+
if sec is not None:
|
|
182
|
+
batch_lookup[(thread_id, sec)] = message_id
|
|
183
|
+
|
|
184
|
+
for rec in staging_records:
|
|
185
|
+
source_key = rec.get("reply_to_message_id")
|
|
186
|
+
if source_key is None:
|
|
187
|
+
continue
|
|
188
|
+
|
|
189
|
+
# Always preserve source-native linkage in metadata for traceability.
|
|
190
|
+
if "_metadata" not in rec or not isinstance(rec.get("_metadata"), dict):
|
|
191
|
+
rec["_metadata"] = {}
|
|
192
|
+
rec["_metadata"]["reply_to_source_key"] = source_key
|
|
193
|
+
|
|
194
|
+
source_key_text = str(source_key).strip()
|
|
195
|
+
if not source_key_text:
|
|
196
|
+
rec["reply_to_message_id"] = None
|
|
197
|
+
continue
|
|
198
|
+
|
|
199
|
+
# Already canonical format.
|
|
200
|
+
if source_key_text.startswith("signal:"):
|
|
201
|
+
rec["reply_to_message_id"] = source_key_text
|
|
202
|
+
continue
|
|
203
|
+
|
|
204
|
+
thread_id = str(rec.get("thread_id") or rec.get("conversation_id") or "")
|
|
205
|
+
sec = _signal_reply_source_key_to_seconds(source_key_text)
|
|
206
|
+
resolved: Optional[str] = None
|
|
207
|
+
|
|
208
|
+
if sec is not None and thread_id:
|
|
209
|
+
resolved = batch_lookup.get((thread_id, sec))
|
|
210
|
+
|
|
211
|
+
# Fallback lookup in already-ingested canonical rows.
|
|
212
|
+
if resolved is None and sec is not None and thread_id and db_conn is not None:
|
|
213
|
+
like_suffix = f"%:{sec}"
|
|
214
|
+
row = db_conn.execute(
|
|
215
|
+
"""
|
|
216
|
+
SELECT message_id
|
|
217
|
+
FROM conversation_messages
|
|
218
|
+
WHERE dataset_id = ?
|
|
219
|
+
AND source_id = 'signal'
|
|
220
|
+
AND conversation_id = ?
|
|
221
|
+
AND message_id LIKE ?
|
|
222
|
+
ORDER BY event_at DESC
|
|
223
|
+
LIMIT 1
|
|
224
|
+
""",
|
|
225
|
+
(dataset_id, thread_id, like_suffix),
|
|
226
|
+
).fetchone()
|
|
227
|
+
if row:
|
|
228
|
+
resolved = row[0]
|
|
229
|
+
|
|
230
|
+
# Store canonical link when matched; otherwise keep source key for now.
|
|
231
|
+
if resolved:
|
|
232
|
+
rec["reply_to_message_id"] = resolved
|
|
233
|
+
else:
|
|
234
|
+
rec["reply_to_message_id"] = source_key_text
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _backfill_signal_reply_links_in_db(*, db_conn: Any, dataset_id: str) -> int:
|
|
238
|
+
"""Resolve persisted Signal reply keys (ms/sec source keys -> canonical message_id)."""
|
|
239
|
+
if db_conn is None:
|
|
240
|
+
return 0
|
|
241
|
+
updated = 0
|
|
242
|
+
rows = db_conn.execute(
|
|
243
|
+
"""
|
|
244
|
+
SELECT message_id, conversation_id, reply_to_message_id
|
|
245
|
+
FROM conversation_messages
|
|
246
|
+
WHERE dataset_id = ?
|
|
247
|
+
AND source_id = 'signal'
|
|
248
|
+
AND reply_to_message_id IS NOT NULL
|
|
249
|
+
AND reply_to_message_id != ''
|
|
250
|
+
AND reply_to_message_id NOT LIKE 'signal:%'
|
|
251
|
+
""",
|
|
252
|
+
(dataset_id,),
|
|
253
|
+
).fetchall()
|
|
254
|
+
for row in rows:
|
|
255
|
+
row_message_id, conversation_id, reply_key = row
|
|
256
|
+
sec = _signal_reply_source_key_to_seconds(reply_key)
|
|
257
|
+
if sec is None:
|
|
258
|
+
continue
|
|
259
|
+
target = db_conn.execute(
|
|
260
|
+
"""
|
|
261
|
+
SELECT message_id
|
|
262
|
+
FROM conversation_messages
|
|
263
|
+
WHERE dataset_id = ?
|
|
264
|
+
AND source_id = 'signal'
|
|
265
|
+
AND conversation_id = ?
|
|
266
|
+
AND message_id LIKE ?
|
|
267
|
+
ORDER BY event_at DESC
|
|
268
|
+
LIMIT 1
|
|
269
|
+
""",
|
|
270
|
+
(dataset_id, conversation_id, f"%:{sec}"),
|
|
271
|
+
).fetchone()
|
|
272
|
+
if not target:
|
|
273
|
+
continue
|
|
274
|
+
resolved_message_id = target[0]
|
|
275
|
+
if not resolved_message_id or resolved_message_id == row_message_id:
|
|
276
|
+
continue
|
|
277
|
+
db_conn.execute(
|
|
278
|
+
"""
|
|
279
|
+
UPDATE conversation_messages
|
|
280
|
+
SET reply_to_message_id = ?
|
|
281
|
+
WHERE message_id = ?
|
|
282
|
+
""",
|
|
283
|
+
(resolved_message_id, row_message_id),
|
|
284
|
+
)
|
|
285
|
+
updated += 1
|
|
286
|
+
if updated:
|
|
287
|
+
db_conn.commit()
|
|
288
|
+
return updated
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def run_imessage_sync(
    dataset_id: str,
    *,
    checkpoint_store: Optional[CheckpointStore] = None,
    db_conn: Optional[Any] = None,
    chat_db_path: Optional[Any] = None,
    batch_size: int = 5000,
    sync_options: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Run iMessage sync: load checkpoint → read from chat.db → parse → write to conversation_messages → save checkpoint.

    Args:
        dataset_id: Dataset the messages are written under; required.
        checkpoint_store: Store used to load/save progress. Defaults to a
            ``SqliteCheckpointStore`` built on ``db_conn``.
        db_conn: Target database connection. Defaults to
            ``get_db_connection()`` from ``core.state``.
        chat_db_path: Location of chat.db; resolved by the reader when omitted.
        batch_size: Maximum number of rows read per batch.
        sync_options: Optional history bound (``mode`` / ``start_date``).

    Returns dict with status, records_processed, last_record_id, error (if any).
    Failures while loading the checkpoint or running the sync are reported as
    ``{"status": "error", ...}`` rather than raised.
    """
    if not dataset_id:
        return {"status": "error", "error": "dataset_id required", "records_processed": 0}

    if db_conn is None:
        from ..core.state import get_db_connection

        db_conn = get_db_connection()
        if db_conn is None:
            return {"status": "error", "error": "Database connection not available", "records_processed": 0}

    try:
        # Checkpoint access sits inside the try as well: a failing checkpoint
        # read must produce an error result, not an exception for the caller.
        store = checkpoint_store if checkpoint_store is not None else SqliteCheckpointStore(db_conn)
        checkpoint = store.get_checkpoint(dataset_id, IMESSAGE_SCHEMA_ID)
        # "0" means "no checkpoint yet": the implementation reads from the start.
        last_record_id = checkpoint.last_record_id if checkpoint else "0"

        logger.info(
            "run_imessage_sync starting: dataset_id=%s last_record_id=%s",
            # Long ids are truncated to keep log lines short.
            dataset_id[:24] + "..." if len(dataset_id) > 24 else dataset_id,
            last_record_id[:20] + "..." if last_record_id and len(last_record_id) > 20 else last_record_id,
        )

        return _run_imessage_sync_impl(
            dataset_id=dataset_id,
            db_conn=db_conn,
            store=store,
            last_record_id=last_record_id,
            chat_db_path=chat_db_path,
            batch_size=batch_size,
            sync_options=sync_options,
        )
    except Exception as e:
        logger.warning(
            "run_imessage_sync failed (top-level catch): %s",
            e,
            exc_info=True,
        )
        return {"status": "error", "error": str(e), "records_processed": 0}
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _run_imessage_sync_impl(
    dataset_id: str,
    *,
    db_conn: Any,
    store: CheckpointStore,
    last_record_id: str,
    chat_db_path: Optional[Any] = None,
    batch_size: int = 5000,
    sync_options: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Implementation of run_imessage_sync (called inside try so we never raise).

    Reads chat.db in batches of ``batch_size``. Each batch is (1) written to
    the raw tables on a best-effort basis, (2) validated and parsed, (3) mapped
    to canonical payloads and upserted into the conversations tables,
    (4) optionally enriched, and (5) checkpointed at the highest ROWID read.

    Args:
        dataset_id: Dataset the messages are written under.
        db_conn: Target database connection.
        store: Checkpoint store updated after every batch.
        last_record_id: Checkpoint to resume from; ``"0"`` means from the start.
        chat_db_path: chat.db location; the reader's default is used when falsy.
        batch_size: Maximum rows per read.
        sync_options: Optional history bound resolved by
            ``_resolve_sync_start_unix``.

    Returns:
        ``{"status": "ok", "records_processed", "last_record_id"}`` on success,
        or ``{"status": "error", "error", "records_processed"}`` where
        ``records_processed`` counts the records of fully completed batches.
    """
    start_unix, start_error = _resolve_sync_start_unix(sync_options)
    if start_error:
        return {"status": "error", "error": start_error, "records_processed": 0}

    parser_cls = PARSER_REGISTRY.get(IMESSAGE_SCHEMA_ID)
    if not parser_cls:
        return {"status": "error", "error": "No parser for imessage.messages.v1", "records_processed": 0}
    parser = parser_cls(dataset_id=dataset_id, _schema_id=IMESSAGE_SCHEMA_ID)

    from ..storage.canonical import ConversationsTablesManager

    manager = ConversationsTablesManager(db_conn)

    from .sources.imessage_reader import read_imessage_rows_list, get_chat_db_path

    path = chat_db_path or get_chat_db_path()

    # For bounded history sync, restart from row 0 and apply time filter.
    current_last_record_id = "0" if start_unix is not None else last_record_id
    final_last_record_id = last_record_id
    total_processed = 0
    batch_num = 0

    while True:
        batch_num += 1
        try:
            rows = read_imessage_rows_list(
                last_rowid=current_last_record_id if current_last_record_id != "0" else None,
                chat_db_path=path,
                batch_size=batch_size,
                start_unix=start_unix,
            )
        except (FileNotFoundError, PermissionError) as e:
            # Missing / unreadable chat.db is an expected condition: report it without logging.
            return {"status": "error", "error": str(e), "records_processed": total_processed}
        except OSError as e:
            logger.warning(
                "imessage read failed (OSError errno=%s) on batch %d: %s",
                getattr(e, "errno", None),
                batch_num,
                e,
                exc_info=True,
            )
            return {"status": "error", "error": str(e), "records_processed": total_processed}
        except sqlite3.Error as e:
            logger.warning(
                "imessage read failed (sqlite3.Error) on batch %d: %s",
                batch_num,
                e,
                exc_info=True,
            )
            return {"status": "error", "error": str(e), "records_processed": total_processed}
        except Exception as e:
            logger.warning("imessage read failed on batch %d: %s", batch_num, e, exc_info=True)
            return {"status": "error", "error": str(e), "records_processed": total_processed}

        if not rows:
            break

        # Persist raw iMessage payloads for traceability and debugging (non-fatal on failure).
        try:
            from ..storage.raw.raw_tables_manager import RawTablesManager

            raw_tables_manager = RawTablesManager(db_conn)
            for row in rows:
                raw_tables_manager.write_raw_record(
                    source_id=SOURCE_ID_IMESSAGE,
                    source_record_id=str(row.get("id") or ""),
                    payload=row,
                    source_type="chat_messages",
                )
        except Exception as e:
            logger.warning("[PIPELINE:RAW] iMessage raw write failed (non-fatal): %s", e)

        normalized_records: List[Any] = []
        max_rowid: Optional[int] = None
        for row in rows:
            # Track the high-water mark for every row that was read, valid or
            # not. Otherwise a batch made up solely of invalid rows would leave
            # max_rowid unset, end the sync early and stall every later run on
            # the same batch.
            rid = row.get("ROWID")
            if rid is not None and (max_rowid is None or rid > max_rowid):
                max_rowid = rid

            raw = RawRecord(record_id=row["id"], payload=row)
            validation = parser.validate(raw)
            if not validation.is_valid:
                logger.debug("Skip invalid row: %s", validation.errors)
                continue
            normalized_records.append(parser.parse(raw))

        if normalized_records:
            mapped_records = _map_normalized_records_with_canonical_mapper(
                normalized_records,
                source_id=SOURCE_ID_IMESSAGE,
            )
            staging_records: List[Dict[str, Any]] = []
            for rec in mapped_records:
                # Fall back to the dataset id when the payload names no thread.
                thread_id = rec.get("thread_id") or rec.get("conversation_id") or dataset_id
                is_self = str(rec.get("sender_id") or "").strip().lower() == "self"
                staging = {
                    "message_id": rec.get("message_id"),
                    "dataset_id": dataset_id,
                    "thread_id": thread_id,
                    # Records without a timestamp are stamped with the ingestion time.
                    "ts": rec.get("ts") or datetime.now(timezone.utc).isoformat(),
                    "sender_type": rec.get("sender_type", "human"),
                    "sender_id": rec.get("sender_id"),
                    "from_self": is_self,
                    "reply_to_message_id": rec.get("reply_to_message_id"),
                    "message_type": rec.get("message_type"),
                    "event_type": rec.get("event_type"),
                    "content": rec.get("content"),
                    "source_id": SOURCE_ID_IMESSAGE,
                }
                if "_metadata" in rec:
                    staging["_metadata"] = rec["_metadata"]
                staging_records.append(staging)

            try:
                manager.upsert_message_batch(staging_records, dataset_id, SOURCE_ID_IMESSAGE)
            except Exception as e:
                logger.exception("ConversationsTablesManager.upsert_message_batch failed")
                return {"status": "error", "error": str(e), "records_processed": total_processed}

            canonical_messages = [
                {
                    "message_id": rec.get("message_id"),
                    "conversation_id": rec.get("thread_id") or dataset_id,
                    "sender_type": rec.get("sender_type"),
                    "sender_id": rec.get("sender_id"),
                    "reply_to_message_id": rec.get("reply_to_message_id"),
                    "message_type": rec.get("message_type"),
                    "event_type": rec.get("event_type"),
                    "ts": rec.get("ts"),
                    "content": rec.get("content"),
                    "source_id": SOURCE_ID_IMESSAGE,
                }
                for rec in staging_records
            ]
            _run_local_sync_enrichment_if_enabled(
                db_conn=db_conn,
                source_id=SOURCE_ID_IMESSAGE,
                canonical_messages=canonical_messages,
            )

        total_processed += len(normalized_records)

        if max_rowid is None:
            # Defensive: avoid infinite loops if no row in the batch carries a ROWID.
            break

        final_last_record_id = f"imessage:{max_rowid}"
        store.save_checkpoint(IngestionCheckpoint(
            dataset_id=dataset_id,
            schema_id=IMESSAGE_SCHEMA_ID,
            last_record_id=final_last_record_id,
            metadata={},
        ))
        current_last_record_id = final_last_record_id

        # A short batch means the reader has run out of rows.
        if len(rows) < batch_size:
            break

    return {
        "status": "ok",
        "records_processed": total_processed,
        "last_record_id": final_last_record_id,
    }
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
# Schema id for Signal messages; presumably the Signal counterpart of IMESSAGE_SCHEMA_ID - usage is not visible in this part of the file.
SIGNAL_SCHEMA_ID = "signal.messages.v1"
|
|
517
|
+
# source_id for Signal data. NOTE(review): the reply-link SQL above hard-codes the literal 'signal' rather than this constant - keep the two in sync.
SOURCE_ID_SIGNAL = "signal"
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
def run_signal_upload(
    dataset_id: str,
    file_bytes: bytes,
    *,
    my_phone_number: Optional[str] = None,
    owner_user_id: Optional[str] = None,
    db_conn: Optional[Any] = None,
) -> Dict[str, Any]:
    """
    Parse Signal export file (JSON) and write to conversation_messages.

    Uses stored Signal identity for dataset_id if my_phone_number not provided.

    Args:
        dataset_id: Dataset the messages belong to; required.
        file_bytes: Raw bytes of the Signal JSON export; required.
        my_phone_number: Phone number forwarded to the export parser. When
            omitted, the value stored in the Signal identity (if any) is used.
        owner_user_id: Owner id forwarded to the export parser. When omitted
            and a stored identity exists, defaults to ``dataset_id``.
        db_conn: Database connection; resolved via ``get_db_connection()``
            when not supplied.

    Returns:
        ``{"status": "ok", "records_processed": <int>}`` on success, or
        ``{"status": "error", "error": <str>, "records_processed": 0}`` on
        invalid arguments, missing connection, parse failure or upsert failure.
    """
    if not dataset_id:
        return {"status": "error", "error": "dataset_id required", "records_processed": 0}
    if not file_bytes:
        return {"status": "error", "error": "file_bytes required", "records_processed": 0}

    if db_conn is None:
        from ..core.state import get_db_connection
        db_conn = get_db_connection()
    if db_conn is None:
        return {"status": "error", "error": "Database connection not available", "records_processed": 0}

    # Look up the stored identity whenever the phone number is missing, even if
    # the caller already supplied owner_user_id; an explicit owner_user_id is
    # never overwritten.
    if my_phone_number is None:
        from ..storage.signal_identity import get_signal_identity
        identity = get_signal_identity(db_conn, dataset_id)
        if identity:
            my_phone_number = identity.get("my_phone_number")
            owner_user_id = owner_user_id or dataset_id

    try:
        from .sources.signal_export_parser import parse_signal_export_json
        records = parse_signal_export_json(
            file_bytes,
            my_phone_number=my_phone_number,
            owner_user_id=owner_user_id,
        )
    except ValueError as e:
        return {"status": "error", "error": str(e), "records_processed": 0}

    if not records:
        return {"status": "ok", "records_processed": 0}

    # Persist raw Signal payloads for traceability and debugging (non-fatal on failure).
    try:
        from ..storage.raw.raw_tables_manager import RawTablesManager
        raw_tables_manager = RawTablesManager(db_conn)
        for rec in records:
            raw_tables_manager.write_raw_record(
                source_id=SOURCE_ID_SIGNAL,
                source_record_id=str(rec.get("message_id") or rec.get("id") or ""),
                payload=rec,
                source_type="chat_messages",
            )
    except Exception as e:
        logger.warning("[PIPELINE:RAW] Signal upload raw write failed (non-fatal): %s", e)

    # Stamp the dataset on every record after the raw write, so the raw payload
    # is stored as parsed.
    for rec in records:
        rec["dataset_id"] = dataset_id
    _resolve_signal_reply_links(db_conn=db_conn, dataset_id=dataset_id, staging_records=records)

    try:
        from ..storage.canonical import ConversationsTablesManager
        manager = ConversationsTablesManager(db_conn)
        manager.upsert_message_batch(records, dataset_id, SOURCE_ID_SIGNAL)
        _backfill_signal_reply_links_in_db(db_conn=db_conn, dataset_id=dataset_id)
    except Exception as e:
        logger.exception("Signal upload: upsert_message_batch failed")
        return {"status": "error", "error": str(e), "records_processed": 0}

    # Project the stored records onto the field set handed to enrichment.
    canonical_messages = [
        {
            "message_id": rec.get("message_id"),
            "conversation_id": rec.get("thread_id") or rec.get("conversation_id") or dataset_id,
            "sender_type": rec.get("sender_type"),
            "sender_id": rec.get("sender_id"),
            "reply_to_message_id": rec.get("reply_to_message_id"),
            "message_type": rec.get("message_type"),
            "event_type": rec.get("event_type"),
            "ts": rec.get("ts"),
            "content": rec.get("content"),
            "source_id": SOURCE_ID_SIGNAL,
        }
        for rec in records
    ]
    _run_local_sync_enrichment_if_enabled(
        db_conn=db_conn,
        source_id=SOURCE_ID_SIGNAL,
        canonical_messages=canonical_messages,
    )

    return {"status": "ok", "records_processed": len(records)}
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
def run_signal_sync(
    dataset_id: str,
    *,
    checkpoint_store: Optional[CheckpointStore] = None,
    db_conn: Optional[Any] = None,
    my_phone_number: Optional[str] = None,
    owner_user_id: Optional[str] = None,
    batch_size: int = 5000,
    sync_options: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Run Signal sync: load checkpoint → read from SQLCipher DB → parse → write to conversation_messages → save checkpoint.

    Requires pysqlcipher3. Uses stored Signal identity if my_phone_number/owner_user_id not provided.

    Args:
        dataset_id: Dataset the messages belong to; required.
        checkpoint_store: Checkpoint backend; defaults to a
            ``SqliteCheckpointStore`` on ``db_conn``.
        db_conn: Database connection; resolved via ``get_db_connection()``
            when not supplied.
        my_phone_number: Forwarded to the Signal reader; falls back to the
            stored identity's ``my_phone_number``.
        owner_user_id: Stored on each staged record; falls back to
            ``dataset_id``.
        batch_size: Maximum rows requested from the reader per iteration.
        sync_options: Optional dict. ``signal_hex_key`` (non-blank string) is
            forwarded to the reader; the dict is also passed to
            ``_resolve_sync_start_unix`` to derive an optional start time.
            When a start time is resolved, reading restarts from cursor
            ``"0"`` instead of the saved checkpoint.

    Returns:
        On success ``{"status": "ok", "records_processed": <int>,
        "last_record_id": <str>}``; on failure ``{"status": "error",
        "error": <str>, "records_processed": <int processed so far>}``.
    """
    if not dataset_id:
        return {"status": "error", "error": "dataset_id required", "records_processed": 0}

    if db_conn is None:
        from ..core.state import get_db_connection
        db_conn = get_db_connection()
    if db_conn is None:
        return {"status": "error", "error": "Database connection not available", "records_processed": 0}

    if my_phone_number is None or owner_user_id is None:
        from ..storage.signal_identity import get_signal_identity
        identity = get_signal_identity(db_conn, dataset_id)
        my_phone_number = my_phone_number or (identity.get("my_phone_number") if identity else None)
        owner_user_id = owner_user_id or dataset_id

    store = checkpoint_store if checkpoint_store is not None else SqliteCheckpointStore(db_conn)
    checkpoint = store.get_checkpoint(dataset_id, SIGNAL_SCHEMA_ID)
    last_record_id = checkpoint.last_record_id if checkpoint else "0"
    start_unix, start_error = _resolve_sync_start_unix(sync_options)
    if start_error:
        return {"status": "error", "error": start_error, "records_processed": 0}
    signal_key_hex = None
    if isinstance(sync_options, dict):
        candidate = sync_options.get("signal_hex_key")
        if isinstance(candidate, str) and candidate.strip():
            signal_key_hex = candidate.strip()

    parser_cls = PARSER_REGISTRY.get(SIGNAL_SCHEMA_ID)
    if not parser_cls:
        return {"status": "error", "error": "No parser for signal.messages.v1", "records_processed": 0}
    parser = parser_cls(dataset_id=dataset_id, _schema_id=SIGNAL_SCHEMA_ID)
    from .sources.signal_reader import read_signal_rows
    from ..storage.canonical import ConversationsTablesManager
    manager = ConversationsTablesManager(db_conn)

    # An explicit start time overrides the saved checkpoint as the read cursor.
    current_last_record_id = "0" if start_unix is not None else last_record_id
    final_last_record_id = last_record_id
    total_processed = 0

    while True:
        try:
            rows = read_signal_rows(
                last_record_id=current_last_record_id if current_last_record_id != "0" else None,
                my_phone_number=my_phone_number,
                batch_size=batch_size,
                start_unix=start_unix,
                signal_key_hex=signal_key_hex,
            )
        except (ImportError, FileNotFoundError, ValueError) as e:
            # Failure types the original handled explicitly; reported to the
            # caller without a stack trace.
            return {"status": "error", "error": str(e), "records_processed": total_processed}
        except Exception as e:
            # Unexpected reader failure: keep the same error result but log it
            # so it is not silently swallowed.
            logger.exception("Signal sync: read_signal_rows failed")
            return {"status": "error", "error": str(e), "records_processed": total_processed}

        if not rows:
            break

        # Persist raw Signal payloads for traceability and debugging (non-fatal on failure).
        try:
            from ..storage.raw.raw_tables_manager import RawTablesManager
            raw_tables_manager = RawTablesManager(db_conn)
            for row in rows:
                raw_tables_manager.write_raw_record(
                    source_id=SOURCE_ID_SIGNAL,
                    source_record_id=str(row.get("id") or ""),
                    payload=row,
                    source_type="chat_messages",
                )
        except Exception as e:
            logger.warning("[PIPELINE:RAW] Signal sync raw write failed (non-fatal): %s", e)

        row_norm_pairs: List[tuple[Dict[str, Any], Any]] = []
        max_sent_at: Optional[float] = None
        for row in rows:
            # Track the cursor over every row, valid or not, so a batch made
            # up entirely of invalid rows can still be skipped past.
            sat = row.get("sent_at")
            if sat is not None and (max_sent_at is None or sat > max_sent_at):
                max_sent_at = sat
            raw = RawRecord(record_id=row["id"], payload=row)
            validation = parser.validate(raw)
            if not validation.is_valid:
                logger.debug("Skip invalid row: %s", validation.errors)
                continue
            norm = parser.parse(raw)
            row_norm_pairs.append((row, norm))

        if not row_norm_pairs:
            if len(rows) < batch_size:
                break
            if max_sent_at is None:
                # No cursor to advance; stop rather than re-read the same rows.
                break
            next_record_id = f"signal:0:{max_sent_at:.6f}"
            if next_record_id == current_last_record_id:
                # Defensive: the cursor did not move, so the next read would
                # return the same batch again.
                break
            current_last_record_id = next_record_id
            final_last_record_id = current_last_record_id
            store.save_checkpoint(IngestionCheckpoint(
                dataset_id=dataset_id,
                schema_id=SIGNAL_SCHEMA_ID,
                last_record_id=final_last_record_id,
                metadata={},
            ))
            continue

        normalized_records = [norm for _, norm in row_norm_pairs]
        mapped_records = _map_normalized_records_with_canonical_mapper(
            normalized_records,
            source_id=SOURCE_ID_SIGNAL,
        )
        mapped_by_message_id = {
            str(rec.get("message_id")): rec
            for rec in mapped_records
            if rec.get("message_id") is not None
        }

        staging_records: List[Dict[str, Any]] = []
        for row, norm in row_norm_pairs:
            p = norm.payload
            # Mapper output takes precedence; parser payload is the fallback.
            mapped = mapped_by_message_id.get(str(p.get("message_id")), {})
            from_self = (row.get("role") == "user")
            sender_id = mapped.get("sender_id") or p.get("sender_id") or row.get("sender_id")
            if not sender_id:
                sender_id = "self" if from_self else f"unknown:{p.get('thread_id') or p.get('message_id') or 'signal'}"
            staging_records.append({
                "message_id": mapped.get("message_id") or p.get("message_id"),
                "dataset_id": dataset_id,
                "thread_id": (
                    mapped.get("thread_id")
                    or mapped.get("conversation_id")
                    or p.get("thread_id")
                    or p.get("conversation_id")
                    or dataset_id
                ),
                "ts": mapped.get("ts") or p.get("ts") or datetime.now(timezone.utc).isoformat(),
                "sender_type": "self" if from_self else "contact",
                "sender_id": str(sender_id),
                "reply_to_message_id": mapped.get("reply_to_message_id") or p.get("reply_to_message_id"),
                "message_type": mapped.get("message_type") or p.get("message_type"),
                "event_type": mapped.get("event_type") or p.get("event_type"),
                # Empty-string content from the mapper is kept; only None falls back.
                "content": mapped.get("content") if mapped.get("content") is not None else p.get("content"),
                "source_id": SOURCE_ID_SIGNAL,
                "from_self": from_self,
                "owner_user_id": owner_user_id,
            })
            if "_metadata" in mapped:
                staging_records[-1]["_metadata"] = mapped["_metadata"]
            elif "_metadata" in p:
                staging_records[-1]["_metadata"] = p["_metadata"]

        _resolve_signal_reply_links(
            db_conn=db_conn,
            dataset_id=dataset_id,
            staging_records=staging_records,
        )

        try:
            manager.upsert_message_batch(staging_records, dataset_id, SOURCE_ID_SIGNAL)
            _backfill_signal_reply_links_in_db(db_conn=db_conn, dataset_id=dataset_id)
        except Exception as e:
            logger.exception("Signal sync: upsert_message_batch failed")
            return {"status": "error", "error": str(e), "records_processed": total_processed}

        canonical_messages = [
            {
                "message_id": rec.get("message_id"),
                "conversation_id": rec.get("thread_id") or dataset_id,
                "sender_type": rec.get("sender_type"),
                "sender_id": rec.get("sender_id"),
                "reply_to_message_id": rec.get("reply_to_message_id"),
                "message_type": rec.get("message_type"),
                "event_type": rec.get("event_type"),
                "ts": rec.get("ts"),
                "content": rec.get("content"),
                "source_id": SOURCE_ID_SIGNAL,
            }
            for rec in staging_records
        ]
        _run_local_sync_enrichment_if_enabled(
            db_conn=db_conn,
            source_id=SOURCE_ID_SIGNAL,
            canonical_messages=canonical_messages,
        )

        total_processed += len(row_norm_pairs)
        cursor_advanced = False
        if max_sent_at is not None:
            final_last_record_id = f"signal:0:{max_sent_at:.6f}"
            store.save_checkpoint(IngestionCheckpoint(
                dataset_id=dataset_id,
                schema_id=SIGNAL_SCHEMA_ID,
                last_record_id=final_last_record_id,
                metadata={},
            ))
            cursor_advanced = final_last_record_id != current_last_record_id
            current_last_record_id = final_last_record_id

        if len(rows) < batch_size:
            break
        if not cursor_advanced:
            # Defensive: avoid an infinite loop when a full batch does not move
            # the cursor (no sent_at values, or the same maximum as before).
            break

    return {
        "status": "ok",
        "records_processed": total_processed,
        "last_record_id": final_last_record_id,
    }
|