topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,823 @@
1
+ """Local sync ingestion: iMessage, Signal (read from local DB, write to conversation_messages)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import sqlite3
7
+ from datetime import datetime, timedelta, timezone
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ from .checkpoints.checkpoint_store import CheckpointStore, IngestionCheckpoint
11
+ from .checkpoints.sqlite_checkpoint_store import SqliteCheckpointStore
12
+ from .parsers import PARSER_REGISTRY
13
+ from .sources.base import RawRecord
14
+
15
+ logger = logging.getLogger("topos.ingestion.local_sync")
16
+
17
+ IMESSAGE_SCHEMA_ID = "imessage.messages.v1"
18
+ SOURCE_ID_IMESSAGE = "imessage"
19
+
20
+
21
def _run_local_sync_enrichment_if_enabled(
    *,
    db_conn: Any,
    source_id: str,
    canonical_messages: List[Dict[str, Any]],
) -> None:
    """Run canonical enrichment jobs for a local_sync source that opted in.

    The function is a no-op unless all of the following hold:

    * ``canonical_messages`` is non-empty,
    * ``source_id`` resolves to a definition in the source registry,
    * that definition's ``enrichment_trigger`` is ``"automatic"`` (a missing
      attribute is treated as ``"manual"``),
    * that definition lists at least one ``canonical_enrichment_jobs`` entry.

    Every exception raised while resolving or running the jobs is logged as a
    warning and swallowed, so enrichment never fails the calling sync.
    """
    if not canonical_messages:
        return
    try:
        from ..sources.registry import REGISTRY

        definition = REGISTRY.get(source_id)
        trigger = getattr(definition, "enrichment_trigger", "manual") if definition else None
        if trigger != "automatic":
            return

        configured_jobs = getattr(definition, "canonical_enrichment_jobs", []) or []
        job_names = list(configured_jobs)
        if not job_names:
            return

        from ..enrichment.derived_tables import DerivedTablesManager
        from ..enrichment.orchestrator import EnrichmentOrchestrator
        import asyncio as _asyncio

        tables = DerivedTablesManager(conn=db_conn)
        orchestrator = EnrichmentOrchestrator(tables_manager=tables)
        # run_canonical is awaited to completion on a fresh event loop.
        _asyncio.run(orchestrator.run_canonical(canonical_messages, job_names=job_names))
    except Exception as exc:
        logger.warning(
            "[PIPELINE:ENRICHMENT] local_sync enrichment failed (non-fatal): source_id=%s error=%s",
            source_id,
            exc,
            exc_info=True,
        )
53
+
54
+
55
def _resolve_sync_start_unix(options: Optional[Dict[str, Any]]) -> tuple[Optional[float], Optional[str]]:
    """Translate sync options into a lower time bound for the sync.

    Returns a ``(start_unix, error)`` pair:

    * ``(None, None)`` when there are no options or the mode is blank / ``all``
      (no time bound),
    * ``(timestamp, None)`` for the relative windows ``1m``, ``3m``, ``6m``,
      ``1y``, ``5y`` (30, 90, 180, 365 and 5 * 365 days before now, UTC) and
      for ``custom`` with a parseable ``start_date``,
    * ``(None, message)`` when ``custom`` has a missing or unparseable
      ``start_date`` or when the mode is not recognised.
    """
    if not options:
        return None, None

    mode = str(options.get("mode") or "all").strip().lower()
    if mode in ("", "all"):
        return None, None

    now = datetime.now(timezone.utc)
    window_days = {"1m": 30, "3m": 90, "6m": 180, "1y": 365, "5y": 365 * 5}.get(mode)
    if window_days is not None:
        return (now - timedelta(days=window_days)).timestamp(), None

    if mode != "custom":
        return None, f"unknown sync mode: {mode}"

    start_raw = options.get("start_date")
    if not start_raw:
        return None, "start_date is required for custom sync mode"
    try:
        start_text = str(start_raw).strip()
        if len(start_text) == 10:
            # Ten characters: parsed as-is and pinned to UTC.
            parsed = datetime.fromisoformat(start_text).replace(tzinfo=timezone.utc)
        else:
            # A trailing "Z" is rewritten to an explicit UTC offset before parsing.
            parsed = datetime.fromisoformat(start_text.replace("Z", "+00:00"))
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            else:
                parsed = parsed.astimezone(timezone.utc)
        return parsed.timestamp(), None
    except Exception:
        return None, f"invalid start_date: {start_raw}"
91
+
92
+
93
def _map_normalized_records_with_canonical_mapper(
    normalized_records: List[Any],
    *,
    source_id: str,
) -> List[Dict[str, Any]]:
    """Turn normalized records into canonical payload dicts for ``source_id``.

    The mapper class is looked up through the source definition's
    ``canonical_mapper_id``; each record is passed through ``mapper.map`` and a
    copy of the resulting payload is tagged with ``source_id``.

    If anything in that path raises (no mapper registered, import failure, or
    a mapping error on any record) a warning is logged and *all* records are
    mapped with a fallback instead: a copy of each record's ``payload`` with
    ``message_id`` defaulting to the record's ``record_id`` and
    ``conversation_id`` defaulting to the payload's ``thread_id``.
    """
    try:
        from ..canonicalization.mappers import MAPPER_REGISTRY
        from ..sources.registry import REGISTRY

        definition = REGISTRY.get(source_id)
        mapper_id = getattr(definition, "canonical_mapper_id", None) if definition else None
        mapper_cls = MAPPER_REGISTRY.get(mapper_id) if mapper_id else None
        if not mapper_cls:
            raise ValueError(f"No canonical mapper registered for source_id={source_id} mapper_id={mapper_id}")

        mapper = mapper_cls()
        mapped: List[Dict[str, Any]] = []
        for record in normalized_records:
            entry = dict(mapper.map(record).payload or {})
            entry["source_id"] = source_id
            mapped.append(entry)
        return mapped
    except Exception as exc:
        logger.warning(
            "[PIPELINE:CANONICAL] local_sync mapper unavailable for source_id=%s, using fallback payload mapping: %s",
            source_id,
            exc,
        )

    fallback: List[Dict[str, Any]] = []
    for record in normalized_records:
        entry = dict(getattr(record, "payload", {}) or {})
        # Fill identifiers only when the payload does not already carry them.
        entry["message_id"] = entry.get("message_id") or getattr(record, "record_id", None)
        entry["conversation_id"] = entry.get("conversation_id") or entry.get("thread_id")
        entry["source_id"] = source_id
        fallback.append(entry)
    return fallback
132
+
133
+
134
def _signal_reply_source_key_to_seconds(source_key: Any) -> Optional[int]:
    """Reduce a Signal reply key to an integer number of seconds, or ``None``.

    Accepted shapes:

    * ``None`` / blank text -> ``None``;
    * ``"signal:<...>:<number>"`` (three or more colon-separated parts) -> the
      trailing part converted to ``int`` as-is, or ``None`` if not numeric;
    * any other numeric text or number -> its integer value, divided by 1000
      when its magnitude is at least 1e12 (treated as milliseconds);
    * anything else -> ``None``.
    """
    if source_key is None:
        return None
    key_text = str(source_key).strip()
    if not key_text:
        return None

    if key_text.startswith("signal:"):
        segments = key_text.split(":")
        if len(segments) >= 3:
            try:
                return int(float(segments[-1]))
            except Exception:
                return None
        # Fewer than three parts: fall through to the generic numeric parse.

    try:
        numeric = int(float(key_text))
    except Exception:
        return None

    # Common Signal quote.id style is milliseconds.
    is_milliseconds = abs(numeric) >= 1_000_000_000_000
    return int(numeric / 1000) if is_milliseconds else numeric
156
+
157
+
158
def _resolve_signal_reply_links(
    *,
    db_conn: Any,
    dataset_id: str,
    staging_records: List[Dict[str, Any]],
) -> None:
    """Rewrite Signal reply keys in ``staging_records`` to canonical message ids.

    The records are mutated in place. For every record that has a non-``None``
    ``reply_to_message_id``:

    * the original value is copied to ``_metadata["reply_to_source_key"]``
      (``_metadata`` is replaced with a fresh dict if missing or not a dict);
    * a blank key becomes ``None``;
    * a key already starting with ``"signal:"`` is kept (stripped);
    * otherwise the key is converted to seconds and matched, within the same
      thread, first against the messages of this batch and then against
      ``conversation_messages`` rows already stored for ``dataset_id``;
    * when nothing matches, the stripped source key is kept.
    """
    if not staging_records:
        return

    def _thread_of(record: Dict[str, Any]) -> str:
        return str(record.get("thread_id") or record.get("conversation_id") or "")

    # In-batch index: (thread id, sent-at seconds) -> message_id.
    in_batch: Dict[tuple[str, int], str] = {}
    for record in staging_records:
        own_id = str(record.get("message_id") or "")
        thread = _thread_of(record)
        if own_id and thread:
            own_seconds = _signal_reply_source_key_to_seconds(own_id)
            if own_seconds is not None:
                in_batch[(thread, own_seconds)] = own_id

    for record in staging_records:
        raw_key = record.get("reply_to_message_id")
        if raw_key is None:
            continue

        # Keep the source-native reply key around for traceability.
        if not isinstance(record.get("_metadata"), dict):
            record["_metadata"] = {}
        record["_metadata"]["reply_to_source_key"] = raw_key

        key_text = str(raw_key).strip()
        if not key_text:
            record["reply_to_message_id"] = None
            continue
        if key_text.startswith("signal:"):
            # Already in canonical form.
            record["reply_to_message_id"] = key_text
            continue

        thread = _thread_of(record)
        seconds = _signal_reply_source_key_to_seconds(key_text)
        canonical_id: Optional[str] = None

        if seconds is not None and thread:
            canonical_id = in_batch.get((thread, seconds))
            if canonical_id is None and db_conn is not None:
                # Not in this batch: look for a previously ingested row whose
                # message_id ends with ":<seconds>".
                stored = db_conn.execute(
                    """
                    SELECT message_id
                    FROM conversation_messages
                    WHERE dataset_id = ?
                    AND source_id = 'signal'
                    AND conversation_id = ?
                    AND message_id LIKE ?
                    ORDER BY event_at DESC
                    LIMIT 1
                    """,
                    (dataset_id, thread, f"%:{seconds}"),
                ).fetchone()
                if stored:
                    canonical_id = stored[0]

        # Canonical id when matched; otherwise the source key is kept for now.
        record["reply_to_message_id"] = canonical_id or key_text
235
+
236
+
237
def _backfill_signal_reply_links_in_db(*, db_conn: Any, dataset_id: str) -> int:
    """Resolve persisted Signal reply keys (ms/sec source keys -> canonical message_id).

    Scans ``conversation_messages`` rows of ``dataset_id`` with
    ``source_id = 'signal'`` whose ``reply_to_message_id`` is set but does not
    start with ``signal:``. Each such key is converted to seconds and matched
    against a message in the same conversation whose ``message_id`` ends with
    ``:<seconds>``; on a match the row's ``reply_to_message_id`` is rewritten.

    Args:
        db_conn: DB connection exposing ``execute``/``commit`` with ``?``
            placeholders; ``None`` makes the call a no-op.
        dataset_id: Dataset whose rows are scanned and updated.

    Returns:
        Number of UPDATE statements issued (0 when ``db_conn`` is ``None``).
        The transaction is committed only when at least one row was updated.
    """
    if db_conn is None:
        return 0

    pending = db_conn.execute(
        """
        SELECT message_id, conversation_id, reply_to_message_id
        FROM conversation_messages
        WHERE dataset_id = ?
        AND source_id = 'signal'
        AND reply_to_message_id IS NOT NULL
        AND reply_to_message_id != ''
        AND reply_to_message_id NOT LIKE 'signal:%'
        """,
        (dataset_id,),
    ).fetchall()

    updated = 0
    for row_message_id, conversation_id, reply_key in pending:
        sec = _signal_reply_source_key_to_seconds(reply_key)
        if sec is None:
            continue
        target = db_conn.execute(
            """
            SELECT message_id
            FROM conversation_messages
            WHERE dataset_id = ?
            AND source_id = 'signal'
            AND conversation_id = ?
            AND message_id LIKE ?
            ORDER BY event_at DESC
            LIMIT 1
            """,
            (dataset_id, conversation_id, f"%:{sec}"),
        ).fetchone()
        if not target:
            continue
        resolved_message_id = target[0]
        # Never link a message to itself.
        if not resolved_message_id or resolved_message_id == row_message_id:
            continue
        # Scope the write exactly like the reads above: matching on message_id
        # alone would also rewrite rows of other datasets/sources that happen
        # to share the same message_id.
        db_conn.execute(
            """
            UPDATE conversation_messages
            SET reply_to_message_id = ?
            WHERE dataset_id = ?
            AND source_id = 'signal'
            AND message_id = ?
            """,
            (resolved_message_id, dataset_id, row_message_id),
        )
        updated += 1

    if updated:
        db_conn.commit()
    return updated
289
+
290
+
291
def run_imessage_sync(
    dataset_id: str,
    *,
    checkpoint_store: Optional[CheckpointStore] = None,
    db_conn: Optional[Any] = None,
    chat_db_path: Optional[Any] = None,
    batch_size: int = 5000,
    sync_options: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Run iMessage sync: load checkpoint → read from chat.db → parse → write to conversation_messages → save checkpoint.

    Args:
        dataset_id: Dataset the messages are written under (required).
        checkpoint_store: Store used to load/save the sync cursor; defaults to
            a ``SqliteCheckpointStore`` on ``db_conn``.
        db_conn: Target DB connection; defaults to ``get_db_connection()``.
        chat_db_path: Optional override for the chat.db location.
        batch_size: Rows requested per read.
        sync_options: Optional sync-window options (see ``_resolve_sync_start_unix``).

    Returns:
        Dict with status, records_processed, last_record_id, error (if any).
        This function does not raise: every failure after the ``dataset_id``
        check — including connection lookup and checkpoint loading — is logged
        and reported as ``{"status": "error", ...}``.
    """
    if not dataset_id:
        return {"status": "error", "error": "dataset_id required", "records_processed": 0}

    try:
        # Setup lives inside the try as well, so a failing connection lookup or
        # checkpoint read is reported as an error dict instead of propagating.
        if db_conn is None:
            from ..core.state import get_db_connection
            db_conn = get_db_connection()
        if db_conn is None:
            return {"status": "error", "error": "Database connection not available", "records_processed": 0}

        store = checkpoint_store if checkpoint_store is not None else SqliteCheckpointStore(db_conn)
        checkpoint = store.get_checkpoint(dataset_id, IMESSAGE_SCHEMA_ID)
        last_record_id = checkpoint.last_record_id if checkpoint else "0"

        # Long identifiers are truncated in the log line.
        logger.info(
            "run_imessage_sync starting: dataset_id=%s last_record_id=%s",
            dataset_id[:24] + "..." if len(dataset_id) > 24 else dataset_id,
            last_record_id[:20] + "..." if last_record_id and len(last_record_id) > 20 else last_record_id,
        )

        return _run_imessage_sync_impl(
            dataset_id=dataset_id,
            db_conn=db_conn,
            store=store,
            last_record_id=last_record_id,
            chat_db_path=chat_db_path,
            batch_size=batch_size,
            sync_options=sync_options,
        )
    except Exception as e:
        logger.warning(
            "run_imessage_sync failed (top-level catch): %s",
            e,
            exc_info=True,
        )
        return {"status": "error", "error": str(e), "records_processed": 0}
340
+
341
+
342
def _run_imessage_sync_impl(
    dataset_id: str,
    *,
    db_conn: Any,
    store: CheckpointStore,
    last_record_id: str,
    chat_db_path: Optional[Any] = None,
    batch_size: int = 5000,
    sync_options: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Implementation of run_imessage_sync (called inside try so we never raise).

    Reads iMessage rows in batches starting after ``last_record_id`` (or from
    the beginning when ``sync_options`` selects a bounded time window). For
    each batch it:

    1. writes the raw rows through ``RawTablesManager`` (failure is non-fatal),
    2. validates and parses each row with the registered iMessage parser,
    3. maps the parsed records to canonical payloads and upserts them into the
       conversations tables,
    4. triggers enrichment for the source when it is configured as automatic,
    5. saves a checkpoint at the highest ``ROWID`` seen in the batch.

    The loop ends on an empty batch, a short batch (fewer rows than
    ``batch_size``), or a batch in which no row carries a ``ROWID``.

    Returns:
        ``{"status": "ok", "records_processed", "last_record_id"}`` on success,
        or ``{"status": "error", "error", "records_processed"}`` when option
        resolution, parser lookup, reading, or the upsert fails.
    """
    start_unix, start_error = _resolve_sync_start_unix(sync_options)
    if start_error:
        return {"status": "error", "error": start_error, "records_processed": 0}

    parser_cls = PARSER_REGISTRY.get(IMESSAGE_SCHEMA_ID)
    if not parser_cls:
        return {"status": "error", "error": "No parser for imessage.messages.v1", "records_processed": 0}
    parser = parser_cls(dataset_id=dataset_id, _schema_id=IMESSAGE_SCHEMA_ID)
    from ..storage.canonical import ConversationsTablesManager
    manager = ConversationsTablesManager(db_conn)
    from .sources.imessage_reader import read_imessage_rows_list, get_chat_db_path
    path = chat_db_path or get_chat_db_path()

    # For bounded history sync, restart from row 0 and apply time filter.
    current_last_record_id = "0" if start_unix is not None else last_record_id
    final_last_record_id = last_record_id
    total_processed = 0
    batch_num = 0

    while True:
        batch_num += 1
        try:
            rows = read_imessage_rows_list(
                last_rowid=current_last_record_id if current_last_record_id != "0" else None,
                chat_db_path=path,
                batch_size=batch_size,
                start_unix=start_unix,
            )
        except (FileNotFoundError, PermissionError) as e:
            # Reported to the caller without a log line.
            return {"status": "error", "error": str(e), "records_processed": total_processed}
        except OSError as e:
            logger.warning(
                "imessage read failed (OSError errno=%s) on batch %d: %s",
                getattr(e, "errno", None),
                batch_num,
                e,
                exc_info=True,
            )
            return {"status": "error", "error": str(e), "records_processed": total_processed}
        except sqlite3.Error as e:
            logger.warning(
                "imessage read failed (sqlite3.Error) on batch %d: %s",
                batch_num,
                e,
                exc_info=True,
            )
            return {"status": "error", "error": str(e), "records_processed": total_processed}
        except Exception as e:
            logger.warning("imessage read failed on batch %d: %s", batch_num, e, exc_info=True)
            return {"status": "error", "error": str(e), "records_processed": total_processed}

        if not rows:
            break

        # Persist raw iMessage payloads for traceability and debugging (non-fatal on failure).
        try:
            from ..storage.raw.raw_tables_manager import RawTablesManager
            raw_tables_manager = RawTablesManager(db_conn)
            for row in rows:
                raw_tables_manager.write_raw_record(
                    source_id=SOURCE_ID_IMESSAGE,
                    source_record_id=str(row.get("id") or ""),
                    payload=row,
                    source_type="chat_messages",
                )
        except Exception as e:
            logger.warning("[PIPELINE:RAW] iMessage raw write failed (non-fatal): %s", e)

        normalized_records: List[Any] = []
        max_rowid: Optional[int] = None
        for row in rows:
            # Track the cursor over every row that was read, valid or not.
            # If only valid rows advanced it, a batch consisting entirely of
            # invalid rows would leave the checkpoint untouched and each later
            # sync would re-read the same rows and stop at the same place.
            rid = row.get("ROWID")
            if rid is not None and (max_rowid is None or rid > max_rowid):
                max_rowid = rid

            raw = RawRecord(record_id=row["id"], payload=row)
            validation = parser.validate(raw)
            if not validation.is_valid:
                logger.debug("Skip invalid row: %s", validation.errors)
                continue
            normalized_records.append(parser.parse(raw))

        if normalized_records:
            mapped_records = _map_normalized_records_with_canonical_mapper(
                normalized_records,
                source_id=SOURCE_ID_IMESSAGE,
            )
            staging_records: List[Dict[str, Any]] = []
            for rec in mapped_records:
                thread_id = rec.get("thread_id") or rec.get("conversation_id") or dataset_id
                is_self = str(rec.get("sender_id") or "").strip().lower() == "self"
                staging = {
                    "message_id": rec.get("message_id"),
                    "dataset_id": dataset_id,
                    "thread_id": thread_id,
                    # Records without a timestamp are stamped with the current UTC time.
                    "ts": rec.get("ts") or datetime.now(timezone.utc).isoformat(),
                    "sender_type": rec.get("sender_type", "human"),
                    "sender_id": rec.get("sender_id"),
                    "from_self": is_self,
                    "reply_to_message_id": rec.get("reply_to_message_id"),
                    "message_type": rec.get("message_type"),
                    "event_type": rec.get("event_type"),
                    "content": rec.get("content"),
                    "source_id": SOURCE_ID_IMESSAGE,
                }
                if "_metadata" in rec:
                    staging["_metadata"] = rec["_metadata"]
                staging_records.append(staging)

            try:
                manager.upsert_message_batch(staging_records, dataset_id, SOURCE_ID_IMESSAGE)
            except Exception as e:
                logger.exception("ConversationsTablesManager.upsert_message_batch failed")
                return {"status": "error", "error": str(e), "records_processed": total_processed}

            canonical_messages = [
                {
                    "message_id": rec.get("message_id"),
                    "conversation_id": rec.get("thread_id") or dataset_id,
                    "sender_type": rec.get("sender_type"),
                    "sender_id": rec.get("sender_id"),
                    "reply_to_message_id": rec.get("reply_to_message_id"),
                    "message_type": rec.get("message_type"),
                    "event_type": rec.get("event_type"),
                    "ts": rec.get("ts"),
                    "content": rec.get("content"),
                    "source_id": SOURCE_ID_IMESSAGE,
                }
                for rec in staging_records
            ]
            _run_local_sync_enrichment_if_enabled(
                db_conn=db_conn,
                source_id=SOURCE_ID_IMESSAGE,
                canonical_messages=canonical_messages,
            )

        total_processed += len(normalized_records)

        if max_rowid is None:
            # Defensive: avoid infinite loops if no rowid in batch.
            break

        final_last_record_id = f"imessage:{max_rowid}"
        store.save_checkpoint(IngestionCheckpoint(
            dataset_id=dataset_id,
            schema_id=IMESSAGE_SCHEMA_ID,
            last_record_id=final_last_record_id,
            metadata={},
        ))
        current_last_record_id = final_last_record_id

        # A short batch means the reader has no further rows to hand out.
        if len(rows) < batch_size:
            break

    return {
        "status": "ok",
        "records_processed": total_processed,
        "last_record_id": final_last_record_id,
    }
514
+
515
+
516
+ SIGNAL_SCHEMA_ID = "signal.messages.v1"
517
+ SOURCE_ID_SIGNAL = "signal"
518
+
519
+
520
def run_signal_upload(
    dataset_id: str,
    file_bytes: bytes,
    *,
    my_phone_number: Optional[str] = None,
    owner_user_id: Optional[str] = None,
    db_conn: Optional[Any] = None,
) -> Dict[str, Any]:
    """
    Parse Signal export file (JSON) and write to conversation_messages.
    Uses stored Signal identity for dataset_id if my_phone_number not provided.

    Args:
        dataset_id: Dataset the messages are written under (required).
        file_bytes: Raw bytes of the Signal export (required, non-empty).
        my_phone_number: Owner's phone number passed to the export parser.
        owner_user_id: Owner id passed to the export parser.
        db_conn: Target DB connection; defaults to ``get_db_connection()``.

    Returns:
        ``{"status": "ok", "records_processed": n}`` on success, or
        ``{"status": "error", "error": ..., "records_processed": 0}`` when
        input validation, parsing (``ValueError``), reply-link resolution, or
        the upsert fails. Raw-payload persistence, the post-upsert reply-link
        backfill, and enrichment are best-effort and never fail the upload.
    """
    if not dataset_id:
        return {"status": "error", "error": "dataset_id required", "records_processed": 0}
    if not file_bytes:
        return {"status": "error", "error": "file_bytes required", "records_processed": 0}

    if db_conn is None:
        from ..core.state import get_db_connection
        db_conn = get_db_connection()
    if db_conn is None:
        return {"status": "error", "error": "Database connection not available", "records_processed": 0}

    # Fall back to the stored identity only when the caller supplied neither value.
    if my_phone_number is None and owner_user_id is None:
        from ..storage.signal_identity import get_signal_identity
        identity = get_signal_identity(db_conn, dataset_id)
        if identity:
            my_phone_number = identity.get("my_phone_number")
            owner_user_id = dataset_id

    try:
        from .sources.signal_export_parser import parse_signal_export_json
        records = parse_signal_export_json(
            file_bytes,
            my_phone_number=my_phone_number,
            owner_user_id=owner_user_id,
        )
    except ValueError as e:
        return {"status": "error", "error": str(e), "records_processed": 0}

    if not records:
        return {"status": "ok", "records_processed": 0}

    # Persist raw Signal payloads for traceability and debugging (non-fatal on failure).
    try:
        from ..storage.raw.raw_tables_manager import RawTablesManager
        raw_tables_manager = RawTablesManager(db_conn)
        for rec in records:
            raw_tables_manager.write_raw_record(
                source_id=SOURCE_ID_SIGNAL,
                source_record_id=str(rec.get("message_id") or rec.get("id") or ""),
                payload=rec,
                source_type="chat_messages",
            )
    except Exception as e:
        logger.warning("[PIPELINE:RAW] Signal upload raw write failed (non-fatal): %s", e)

    for rec in records:
        rec["dataset_id"] = dataset_id

    # Reply-link resolution queries the DB; report a failure as an error dict
    # like every other failure path instead of letting it escape as an exception.
    try:
        _resolve_signal_reply_links(db_conn=db_conn, dataset_id=dataset_id, staging_records=records)
    except Exception as e:
        logger.exception("Signal upload: reply link resolution failed")
        return {"status": "error", "error": str(e), "records_processed": 0}

    try:
        from ..storage.canonical import ConversationsTablesManager
        manager = ConversationsTablesManager(db_conn)
        manager.upsert_message_batch(records, dataset_id, SOURCE_ID_SIGNAL)
    except Exception as e:
        logger.exception("Signal upload: upsert_message_batch failed")
        return {"status": "error", "error": str(e), "records_processed": 0}

    # The batch is written at this point. The backfill only improves reply
    # links of stored rows, so its failure must not turn the upload into an
    # "error / 0 records" result or skip enrichment.
    try:
        _backfill_signal_reply_links_in_db(db_conn=db_conn, dataset_id=dataset_id)
    except Exception as e:
        logger.warning(
            "Signal upload: reply link backfill failed (non-fatal): %s",
            e,
            exc_info=True,
        )

    canonical_messages = [
        {
            "message_id": rec.get("message_id"),
            "conversation_id": rec.get("thread_id") or rec.get("conversation_id") or dataset_id,
            "sender_type": rec.get("sender_type"),
            "sender_id": rec.get("sender_id"),
            "reply_to_message_id": rec.get("reply_to_message_id"),
            "message_type": rec.get("message_type"),
            "event_type": rec.get("event_type"),
            "ts": rec.get("ts"),
            "content": rec.get("content"),
            "source_id": SOURCE_ID_SIGNAL,
        }
        for rec in records
    ]
    _run_local_sync_enrichment_if_enabled(
        db_conn=db_conn,
        source_id=SOURCE_ID_SIGNAL,
        canonical_messages=canonical_messages,
    )

    return {"status": "ok", "records_processed": len(records)}
611
+
612
+
613
def run_signal_sync(
    dataset_id: str,
    *,
    checkpoint_store: Optional[CheckpointStore] = None,
    db_conn: Optional[Any] = None,
    my_phone_number: Optional[str] = None,
    owner_user_id: Optional[str] = None,
    batch_size: int = 5000,
    sync_options: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Run Signal sync: load checkpoint → read from SQLCipher DB → parse → write to conversation_messages → save checkpoint.
    Requires pysqlcipher3. Uses stored Signal identity if my_phone_number/owner_user_id not provided.

    Rows are read in batches of ``batch_size`` until a short or empty batch
    is returned. After each batch the checkpoint is moved to the largest
    ``sent_at`` seen among ALL rows read (valid or not), so a batch made up
    only of invalid rows can no longer stall the sync.

    Args:
        dataset_id: Dataset the messages belong to.
        checkpoint_store: Checkpoint backend; defaults to
            ``SqliteCheckpointStore(db_conn)``.
        db_conn: Database connection; resolved via ``get_db_connection()``
            when omitted.
        my_phone_number: Phone number passed to the Signal reader.
        owner_user_id: Owner id stamped on staged records; defaults to
            ``dataset_id``.
        batch_size: Maximum number of rows requested per read.
        sync_options: Optional settings. ``"signal_hex_key"`` (non-blank
            string) is forwarded to the reader; the start time is derived
            by ``_resolve_sync_start_unix``.

    Returns:
        ``{"status": "ok", "records_processed": n, "last_record_id": id}``
        on success, or ``{"status": "error", "error": msg,
        "records_processed": n}`` where ``n`` counts the records written
        before the failure.
    """
    if not dataset_id:
        return {"status": "error", "error": "dataset_id required", "records_processed": 0}

    if db_conn is None:
        from ..core.state import get_db_connection
        db_conn = get_db_connection()
    if db_conn is None:
        return {"status": "error", "error": "Database connection not available", "records_processed": 0}

    if my_phone_number is None or owner_user_id is None:
        from ..storage.signal_identity import get_signal_identity
        identity = get_signal_identity(db_conn, dataset_id)
        my_phone_number = my_phone_number or (identity.get("my_phone_number") if identity else None)
        owner_user_id = owner_user_id or dataset_id

    store = checkpoint_store if checkpoint_store is not None else SqliteCheckpointStore(db_conn)
    checkpoint = store.get_checkpoint(dataset_id, SIGNAL_SCHEMA_ID)
    last_record_id = checkpoint.last_record_id if checkpoint else "0"
    start_unix, start_error = _resolve_sync_start_unix(sync_options)
    if start_error:
        return {"status": "error", "error": start_error, "records_processed": 0}
    signal_key_hex = None
    if isinstance(sync_options, dict):
        candidate = sync_options.get("signal_hex_key")
        if isinstance(candidate, str) and candidate.strip():
            signal_key_hex = candidate.strip()

    parser_cls = PARSER_REGISTRY.get(SIGNAL_SCHEMA_ID)
    if not parser_cls:
        return {"status": "error", "error": "No parser for signal.messages.v1", "records_processed": 0}
    parser = parser_cls(dataset_id=dataset_id, _schema_id=SIGNAL_SCHEMA_ID)
    from .sources.signal_reader import read_signal_rows
    from ..storage.canonical import ConversationsTablesManager
    manager = ConversationsTablesManager(db_conn)

    def _save_checkpoint(record_id: str) -> None:
        # Persist the read cursor for this dataset/schema pair.
        store.save_checkpoint(IngestionCheckpoint(
            dataset_id=dataset_id,
            schema_id=SIGNAL_SCHEMA_ID,
            last_record_id=record_id,
            metadata={},
        ))

    # An explicit start time restarts the read from the beginning ("0");
    # otherwise reading resumes from the stored checkpoint.
    current_last_record_id = "0" if start_unix is not None else last_record_id
    final_last_record_id = last_record_id
    total_processed = 0

    while True:
        try:
            rows = read_signal_rows(
                last_record_id=current_last_record_id if current_last_record_id != "0" else None,
                my_phone_number=my_phone_number,
                batch_size=batch_size,
                start_unix=start_unix,
                signal_key_hex=signal_key_hex,
            )
        except Exception as e:
            # Every reader failure (missing dependency, missing file, bad key,
            # ...) is reported to the caller the same way.
            logger.warning("Signal sync: read_signal_rows failed: %s", e)
            return {"status": "error", "error": str(e), "records_processed": total_processed}

        if not rows:
            break

        # Persist raw Signal payloads for traceability and debugging (non-fatal on failure).
        try:
            from ..storage.raw.raw_tables_manager import RawTablesManager
            raw_tables_manager = RawTablesManager(db_conn)
            for row in rows:
                raw_tables_manager.write_raw_record(
                    source_id=SOURCE_ID_SIGNAL,
                    source_record_id=str(row.get("id") or ""),
                    payload=row,
                    source_type="chat_messages",
                )
        except Exception as e:
            logger.warning("[PIPELINE:RAW] Signal sync raw write failed (non-fatal): %s", e)

        row_norm_pairs: List[tuple[Dict[str, Any], Any]] = []
        max_sent_at: Optional[float] = None
        for row in rows:
            # Track the cursor over every row read, before validation, so
            # that invalid rows are skipped on the next read instead of
            # being fetched again.
            sat = row.get("sent_at")
            if sat is not None and (max_sent_at is None or sat > max_sent_at):
                max_sent_at = sat
            raw = RawRecord(record_id=row["id"], payload=row)
            validation = parser.validate(raw)
            if not validation.is_valid:
                logger.debug("Skip invalid row: %s", validation.errors)
                continue
            norm = parser.parse(raw)
            row_norm_pairs.append((row, norm))

        is_short_batch = len(rows) < batch_size
        previous_last_record_id = current_last_record_id

        if not row_norm_pairs:
            # Nothing usable in this batch: stop on the final batch or when
            # there is no timestamp to advance to, otherwise move past it.
            if is_short_batch or max_sent_at is None:
                break
            current_last_record_id = f"signal:0:{max_sent_at:.6f}"
            if current_last_record_id == previous_last_record_id:
                logger.warning("Signal sync: cursor did not advance at %s; stopping", current_last_record_id)
                break
            final_last_record_id = current_last_record_id
            _save_checkpoint(final_last_record_id)
            continue

        normalized_records = [norm for _, norm in row_norm_pairs]
        mapped_records = _map_normalized_records_with_canonical_mapper(
            normalized_records,
            source_id=SOURCE_ID_SIGNAL,
        )
        mapped_by_message_id = {
            str(rec.get("message_id")): rec
            for rec in mapped_records
            if rec.get("message_id") is not None
        }

        staging_records: List[Dict[str, Any]] = []
        for row, norm in row_norm_pairs:
            p = norm.payload
            # Mapper output wins; the parser payload is the fallback.
            mapped = mapped_by_message_id.get(str(p.get("message_id")), {})
            # Rows with role "user" are treated as sent by the account owner.
            from_self = (row.get("role") == "user")
            sender_id = mapped.get("sender_id") or p.get("sender_id") or row.get("sender_id")
            if not sender_id:
                sender_id = "self" if from_self else f"unknown:{p.get('thread_id') or p.get('message_id') or 'signal'}"
            record = {
                "message_id": mapped.get("message_id") or p.get("message_id"),
                "dataset_id": dataset_id,
                "thread_id": mapped.get("thread_id") or mapped.get("conversation_id") or p.get("thread_id") or p.get("conversation_id") or dataset_id,
                "ts": mapped.get("ts") or p.get("ts") or datetime.now(timezone.utc).isoformat(),
                "sender_type": "self" if from_self else "contact",
                "sender_id": str(sender_id),
                "reply_to_message_id": mapped.get("reply_to_message_id") or p.get("reply_to_message_id"),
                "message_type": mapped.get("message_type") or p.get("message_type"),
                "event_type": mapped.get("event_type") or p.get("event_type"),
                "content": mapped.get("content") if mapped.get("content") is not None else p.get("content"),
                "source_id": SOURCE_ID_SIGNAL,
                "from_self": from_self,
                "owner_user_id": owner_user_id,
            }
            if "_metadata" in mapped:
                record["_metadata"] = mapped["_metadata"]
            elif "_metadata" in p:
                record["_metadata"] = p["_metadata"]
            staging_records.append(record)

        _resolve_signal_reply_links(
            db_conn=db_conn,
            dataset_id=dataset_id,
            staging_records=staging_records,
        )

        try:
            manager.upsert_message_batch(staging_records, dataset_id, SOURCE_ID_SIGNAL)
            _backfill_signal_reply_links_in_db(db_conn=db_conn, dataset_id=dataset_id)
        except Exception as e:
            logger.exception("Signal sync: upsert_message_batch failed")
            return {"status": "error", "error": str(e), "records_processed": total_processed}

        # Reduced per-message view handed to the optional local enrichment step.
        canonical_messages = [
            {
                "message_id": rec.get("message_id"),
                "conversation_id": rec.get("thread_id") or dataset_id,
                "sender_type": rec.get("sender_type"),
                "sender_id": rec.get("sender_id"),
                "reply_to_message_id": rec.get("reply_to_message_id"),
                "message_type": rec.get("message_type"),
                "event_type": rec.get("event_type"),
                "ts": rec.get("ts"),
                "content": rec.get("content"),
                "source_id": SOURCE_ID_SIGNAL,
            }
            for rec in staging_records
        ]
        _run_local_sync_enrichment_if_enabled(
            db_conn=db_conn,
            source_id=SOURCE_ID_SIGNAL,
            canonical_messages=canonical_messages,
        )

        total_processed += len(row_norm_pairs)
        if max_sent_at is not None:
            final_last_record_id = f"signal:0:{max_sent_at:.6f}"
            _save_checkpoint(final_last_record_id)
        current_last_record_id = final_last_record_id

        if is_short_batch:
            break
        if current_last_record_id == previous_last_record_id:
            # A full batch that leaves the cursor unchanged would make the
            # next read identical to this one; stop instead of looping forever.
            logger.warning("Signal sync: cursor did not advance at %s; stopping", current_last_record_id)
            break

    return {
        "status": "ok",
        "records_processed": total_processed,
        "last_record_id": final_last_record_id,
    }