topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,479 @@
1
+ """iMessage reader: copy chat.db to temp (or open read-only), query messages since checkpoint.
2
+
3
+ Requires macOS and Full Disk Access for ~/Library/Messages/chat.db.
4
+ Uses chunked copy to support chat.db larger than ~2GB (avoids errno 84 EOVERFLOW from sendfile).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import errno
10
+ import logging
11
+ import os
12
+ import plistlib
13
+ import re
14
+ import sqlite3
15
+ import tempfile
16
+ from pathlib import Path
17
+ from typing import Any, Dict, Iterator, Optional
18
+
19
+ logger = logging.getLogger("topos.ingestion.sources.imessage_reader")
20
+
21
+ # Mac epoch: seconds between 2001-01-01 and 1970-01-01
22
+ MAC_EPOCH_OFFSET = 978307200
23
+
24
+ DEFAULT_CHAT_DB_PATH = Path.home() / "Library" / "Messages" / "chat.db"
25
+
26
+ # Chunk size for copy (avoids sendfile/stat overflow on files > ~2GB)
27
+ COPY_CHUNK_SIZE = 8 * 1024 * 1024 # 8 MiB
28
+
29
+
30
def get_chat_db_path() -> Path:
    """Resolve the location of the iMessage chat.db (macOS).

    The ``IMESSAGE_CHAT_DB`` environment variable wins when it is set;
    otherwise ``DEFAULT_CHAT_DB_PATH`` is used.
    """
    fallback = str(DEFAULT_CHAT_DB_PATH)
    configured = os.environ.get("IMESSAGE_CHAT_DB", fallback)
    return Path(configured)
33
+
34
+
35
def mac_epoch_to_unix(mac_date: Optional[int]) -> Optional[float]:
    """Translate an iMessage ``message.date`` value into a Unix timestamp.

    The stored value counts from 2001-01-01 but its unit varies (seconds,
    milliseconds, microseconds or nanoseconds). The unit is guessed from the
    magnitude, the value is scaled down to seconds, and ``MAC_EPOCH_OFFSET``
    is added. ``None`` is passed through unchanged.
    """
    if mac_date is None:
        return None

    seconds = float(mac_date)
    magnitude = abs(seconds)
    # (minimum magnitude, divisor) pairs, largest unit first:
    # nanoseconds ~1e18, microseconds ~1e15, milliseconds ~1e12, seconds ~1e9.
    unit_table = (
        (1e17, 1_000_000_000.0),
        (1e14, 1_000_000.0),
        (1e11, 1_000.0),
    )
    for floor, divisor in unit_table:
        if magnitude >= floor:
            seconds = seconds / divisor
            break
    return seconds + MAC_EPOCH_OFFSET
58
+
59
+
60
def _copy_large_file(src: Path, dst: str, show_progress: bool = True) -> None:
    """Copy ``src`` to ``dst`` in ``COPY_CHUNK_SIZE`` chunks via raw file descriptors.

    Only ``os.open``/``os.read``/``os.write`` are used so that the copy does
    not hit EOVERFLOW (errno 84) on very large files.

    Args:
        src: File to copy.
        dst: Destination path; created or truncated with mode ``0o600``.
        show_progress: When true and the source size can be determined, a
            progress bar is shown. A failing ``stat`` only disables the bar.

    Raises:
        OSError: If opening, reading or writing either file fails.
    """
    total_size: Optional[int] = None
    if show_progress:
        try:
            total_size = src.stat().st_size
        except OSError as e:
            # The size is only needed for the progress bar, so a stat failure is not fatal.
            if getattr(e, "errno", None) == errno.EOVERFLOW:
                logger.debug("chat.db size overflow (EOVERFLOW), copying without progress bar")
            total_size = None

    pbar = None
    if show_progress and total_size is not None and total_size > 0:
        from topos.enrichment.progress_bar import ProgressBar
        pbar = ProgressBar(total=total_size, desc="Copying chat.db", width=40)
        pbar.__enter__()

    fd_in = fd_out = None
    try:
        try:
            fd_in = os.open(str(src), os.O_RDONLY)
        except OSError as e:
            if getattr(e, "errno", None) == errno.EOVERFLOW:
                logger.warning(
                    "EOVERFLOW opening source chat.db (file may be too large for this system): path=%s",
                    src,
                )
            raise
        fd_out = os.open(dst, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        while True:
            chunk = os.read(fd_in, COPY_CHUNK_SIZE)
            if not chunk:
                break
            # os.write may write fewer bytes than requested; keep going until
            # the whole chunk is on disk, otherwise the copy is silently truncated.
            pending = memoryview(chunk)
            while len(pending) > 0:
                written = os.write(fd_out, pending)
                pending = pending[written:]
            if pbar is not None:
                pbar.update(len(chunk))
    finally:
        # Close whatever was opened; a failing close must not mask the original error.
        if fd_in is not None:
            try:
                os.close(fd_in)
            except OSError:
                pass
        if fd_out is not None:
            try:
                os.close(fd_out)
            except OSError:
                pass
        if pbar is not None:
            pbar.close()
111
+
112
+
113
def _normalize_sender_id(value: Any) -> Optional[str]:
    """Return the stripped string form of a ``handle.id`` value, or None when missing/blank."""
    if value is None:
        return None
    cleaned = str(value).strip()
    if cleaned:
        return cleaned
    return None
119
+
120
+
121
def _extract_text_from_plist(obj: Any) -> list[str]:
    """Walk a parsed plist structure and collect strings that look like text.

    Strings are whitespace-collapsed and kept only when they contain at least
    one alphabetic character. Dict values are visited except under keyed-archive
    bookkeeping keys; lists, tuples and sets are visited element by element.
    Any other type contributes nothing.
    """
    if isinstance(obj, str):
        collapsed = " ".join(obj.split()).strip()
        if collapsed and any(ch.isalpha() for ch in collapsed):
            return [collapsed]
        return []

    if isinstance(obj, dict):
        # Keys that only carry archive structure, never message text.
        structural_keys = {
            "$archiver", "$version", "$objects", "$top", "$class",
            "NS.keys", "NS.objects",
        }
        collected: list[str] = []
        for key, child in obj.items():
            if isinstance(key, str) and key in structural_keys:
                continue
            collected.extend(_extract_text_from_plist(child))
        return collected

    if isinstance(obj, (list, tuple, set)):
        collected = []
        for child in obj:
            collected.extend(_extract_text_from_plist(child))
        return collected

    return []
141
+
142
+
143
def _looks_like_archive_noise(s: str) -> bool:
    """Report whether ``s`` looks like keyed-archive bookkeeping rather than message text.

    The comparison is case-insensitive: strings starting with ``ns.``, strings
    containing one of a few archive class-name markers (or the word
    "attribute"), and a handful of exact archive tokens count as noise.
    """
    lowered = s.lower()
    if lowered.startswith("ns."):
        return True
    substrings = (
        "nskeyedarchiver",
        "nsdictionary",
        "nsmutablestring",
        "nsnumber",
        "attribute",
    )
    if any(marker in lowered for marker in substrings):
        return True
    return lowered in {"bplist00", "$objects", "$top", "$version", "$archiver"}
154
+
155
+
156
def _extract_utf8_text_candidates(raw: bytes) -> list[str]:
    """Pull plausible human-readable strings out of a UTF-8 byte payload.

    Only UTF-8 is attempted on purpose: blindly decoding archive bytes as
    utf-16/latin encodings produces fake CJK-looking gibberish. Runs of four
    or more non-control characters are whitespace-collapsed, and a run is kept
    only if it is not archive noise and has at least three letters.
    """
    try:
        text = raw.decode("utf-8", errors="ignore")
    except Exception:
        return []

    kept: list[str] = []
    for run in re.findall(r"[^\x00-\x1F]{4,}", text):
        normalized = " ".join(run.split()).strip()
        if not normalized or _looks_like_archive_noise(normalized):
            continue
        # Purely symbolic or numeric fragments are unlikely to be message text.
        letter_count = sum(map(str.isalpha, normalized))
        if letter_count >= 3:
            kept.append(normalized)
    return kept
179
+
180
+
181
def _extract_text_from_attributed_body(value: Any) -> Optional[str]:
    """Best-effort recovery of message text from an ``attributedBody`` value.

    Strings are whitespace-collapsed and returned directly. Byte-like values
    are parsed as a binary plist when they carry the ``bplist00`` header, with
    a UTF-8 scan as fallback. Arbitrary bytes are never decoded wholesale as
    text, since that yields garbage. The longest non-noise candidate wins;
    None is returned when nothing usable is found or the type is unsupported.
    """
    if value is None:
        return None
    if isinstance(value, str):
        return " ".join(value.split()).strip() or None
    if not isinstance(value, (bytes, bytearray, memoryview)):
        return None

    blob = bytes(value)
    found: list[str] = []

    # Modern attributedBody payloads are often keyed-archive binary plists.
    if blob.startswith(b"bplist00"):
        try:
            found.extend(_extract_text_from_plist(plistlib.loads(blob)))
        except Exception:
            pass

    # Non-plist payloads (or plists that yielded nothing) may still embed UTF-8 text.
    if not found:
        found.extend(_extract_utf8_text_candidates(blob))

    usable = [s for s in found if not _looks_like_archive_noise(s)]
    if not usable:
        return None

    # Drop U+FFFC object-replacement characters left by inline attachments.
    longest = max(usable, key=len).replace("\ufffc", "").strip()
    return longest or None
219
+
220
+
221
def _build_content_from_row(row: Dict[str, Any]) -> Optional[str]:
    """Produce a content string for an iMessage row, covering non-text forms.

    Preference order: message text, text recovered from the attributed body,
    subject, then synthesized placeholders for reactions, attachments and
    non-zero item types. Returns None when nothing applies.
    """
    readable = (
        (row.get("text") or "").strip()
        or _extract_text_from_attributed_body(row.get("attributed_body"))
        or (row.get("subject") or "").strip()
    )
    if readable:
        return readable

    # Tapbacks / reaction-style records carry no text of their own.
    reaction_target = row.get("associated_message_guid")
    reaction_kind = row.get("associated_message_type")
    if reaction_target:
        return f"[reaction:{reaction_kind}]"

    if row.get("cache_has_attachments"):
        return "[attachment]"

    kind = row.get("item_type")
    if kind in (None, 0, "0"):
        return None
    return f"[system_event:item_type={kind}]"
249
+
250
+
251
+ def _extract_imessage_context(row: Dict[str, Any]) -> Dict[str, Any]:
252
+ """Extract unified reply/system context for canonical mapping: the result always has message_type and event_type, plus reply_to_message_id and _metadata when those are non-empty; a non-zero item_type or group_action_type marks the row as a "system" message."""
253
+ metadata: Dict[str, Any] = {}
254
+ reply_to_message_id: Optional[str] = None
255
+ message_type = "message"
256
+ event_type: Optional[str] = None
257
+
258
+ thread_originator_guid = row.get("thread_originator_guid")
259
+ if thread_originator_guid:
260
+ reply_to_message_id = str(thread_originator_guid)
261
+ metadata["thread_originator_guid"] = str(thread_originator_guid)
262
+ if row.get("thread_originator_part") is not None:
263
+ metadata["thread_originator_part"] = row.get("thread_originator_part")
264
+
265
+ associated_guid = row.get("associated_message_guid")
266
+ associated_type = row.get("associated_message_type")
267
+ if associated_guid:
268
+ metadata["associated_message_guid"] = str(associated_guid)
269
+ if associated_type is not None:
270
+ metadata["associated_message_type"] = associated_type
271
+
272
+ item_type = row.get("item_type")
273
+ if item_type not in (None, 0, "0"):
274
+ message_type = "system"
275
+ event_type = f"imessage_item_type:{item_type}"
276
+ metadata["item_type"] = item_type
277
+
278
+ group_action_type = row.get("group_action_type")
279
+ if group_action_type not in (None, 0, "0"):
280
+ message_type = "system"
281
+ event_type = f"imessage_group_action:{group_action_type}"
282
+ metadata["group_action_type"] = group_action_type
283
+
284
+ if row.get("message_guid"):
285
+ metadata["message_guid"] = str(row.get("message_guid"))
286
+ if row.get("chat_guid"):
287
+ metadata["chat_guid"] = str(row.get("chat_guid"))
288
+ if row.get("chat_identifier"):
289
+ metadata["chat_identifier"] = str(row.get("chat_identifier"))
290
+
291
+ result: Dict[str, Any] = {
292
+ "message_type": message_type,
293
+ "event_type": event_type,
294
+ }
295
+ if reply_to_message_id:
296
+ result["reply_to_message_id"] = reply_to_message_id
297
+ if metadata:
298
+ result["_metadata"] = metadata
299
+ return result
300
+
301
+
302
def read_imessage_rows(
    last_rowid: Optional[str] = None,
    chat_db_path: Optional[Path] = None,
    batch_size: int = 5000,
    start_unix: Optional[float] = None,
) -> Iterator[Dict[str, Any]]:
    """Yield normalized iMessage rows newer than a checkpoint from a temp copy of chat.db.

    chat.db is copied to a temporary file (chunked, to support >2GB), the copy
    is queried for messages with ``ROWID > last_rowid``, and the temp file is
    removed when the generator finishes, is closed, or setup fails.

    Args:
        last_rowid: Checkpoint, either ``"imessage:12345"`` or ``"12345"``.
            Missing or unparsable values mean "start from 0".
        chat_db_path: Database to read; defaults to ``get_chat_db_path()``.
        batch_size: SQL ``LIMIT`` applied to the query.
        start_unix: Optional lower bound (Unix seconds) on the message date.

    Yields:
        Dicts with ``id`` (``imessage:ROWID``), ``thread_id`` (str chat id),
        ``content``, ``created_at`` (Unix ts or None), ``role`` (user/other),
        ``sender_id`` and ``ROWID``, plus ``reply_to_message_id``,
        ``message_type``, ``event_type`` and ``_metadata`` when available.
        Rows for which no content can be built are skipped.

    Raises:
        FileNotFoundError: If the database path does not exist.
        PermissionError: If chat.db cannot be copied.
    """
    path = chat_db_path or get_chat_db_path()
    if not path.exists():
        raise FileNotFoundError(f"chat.db not found at {path}; Full Disk Access may be required")

    # NOTE(review): only the main database file is copied. If chat.db runs in
    # WAL mode, rows still sitting in chat.db-wal would be missed — confirm.
    copy_path = None
    try:
        fd, copy_path = tempfile.mkstemp(suffix=".db", prefix="topos_imessage_")
        os.close(fd)
        try:
            _copy_large_file(path, copy_path)
        except OSError as e:
            if getattr(e, "errno", None) == errno.EOVERFLOW:
                # The size probe for the progress bar can overflow; retry without it.
                try:
                    _copy_large_file(path, copy_path, show_progress=False)
                except OSError as retry_e:
                    raise PermissionError(f"Cannot copy chat.db: {retry_e}. Full Disk Access may be required.") from retry_e
            else:
                raise PermissionError(f"Cannot copy chat.db: {e}. Full Disk Access may be required.") from e
    except BaseException:
        # Every failure, including the PermissionError raised above, must remove
        # the partial copy so private message data is not left in the temp dir.
        if copy_path and os.path.exists(copy_path):
            try:
                os.unlink(copy_path)
            except OSError:
                pass
        raise

    try:
        try:
            conn = sqlite3.connect(copy_path)
        except OSError as e:
            if getattr(e, "errno", None) == errno.EOVERFLOW:
                logger.warning(
                    "EOVERFLOW opening copied chat.db with SQLite (copied file may be too large): %s",
                    copy_path,
                )
            raise
        conn.row_factory = sqlite3.Row
        try:
            # Optional columns differ between macOS versions; discover what exists.
            message_columns = {
                str(r["name"])
                for r in conn.execute("PRAGMA table_info(message)").fetchall()
                if r["name"]
            }
            chat_columns = {
                str(r["name"])
                for r in conn.execute("PRAGMA table_info(chat)").fetchall()
                if r["name"]
            }

            def _message_col_or_null(column: str, alias: str) -> str:
                if column in message_columns:
                    return f"message.{column} AS {alias}"
                return f"NULL AS {alias}"

            def _chat_col_or_null(column: str, alias: str) -> str:
                if column in chat_columns:
                    return f"chat.{column} AS {alias}"
                return f"NULL AS {alias}"

            last = 0
            if last_rowid:
                # last_record_id may be "imessage:12345" or "12345"
                raw = last_rowid.split(":")[-1]
                try:
                    last = int(raw)
                except ValueError:
                    pass
            mac_start_seconds = None
            if start_unix is not None:
                mac_start_seconds = float(start_unix) - MAC_EPOCH_OFFSET
            # Include non-text message forms too; content is synthesized when text is absent.
            # Only fixed column names are interpolated; all values are bound parameters.
            query = f"""
                SELECT message.ROWID AS rowid,
                       message.text AS text,
                       message.subject AS subject,
                       message.attributedBody AS attributed_body,
                       message.associated_message_guid AS associated_message_guid,
                       message.associated_message_type AS associated_message_type,
                       message.cache_has_attachments AS cache_has_attachments,
                       message.item_type AS item_type,
                       {_message_col_or_null("group_action_type", "group_action_type")},
                       {_message_col_or_null("thread_originator_guid", "thread_originator_guid")},
                       {_message_col_or_null("thread_originator_part", "thread_originator_part")},
                       {_message_col_or_null("guid", "message_guid")},
                       message.date AS date,
                       message.handle_id AS handle_id,
                       message.is_from_me AS is_from_me,
                       handle.id AS sender_id,
                       chat.ROWID AS chat_id,
                       {_chat_col_or_null("guid", "chat_guid")},
                       {_chat_col_or_null("chat_identifier", "chat_identifier")}
                FROM message
                JOIN chat_message_join ON message.ROWID = chat_message_join.message_id
                JOIN chat ON chat.ROWID = chat_message_join.chat_id
                LEFT JOIN handle ON handle.ROWID = message.handle_id
                WHERE message.ROWID > ?
                  AND (
                    ? IS NULL
                    OR (
                      CASE
                        WHEN abs(message.date) >= 100000000000000000 THEN (message.date / 1000000000.0)
                        WHEN abs(message.date) >= 100000000000000 THEN (message.date / 1000000.0)
                        WHEN abs(message.date) >= 100000000000 THEN (message.date / 1000.0)
                        ELSE (message.date * 1.0)
                      END
                    ) >= ?
                  )
                ORDER BY message.ROWID
                LIMIT ?
            """
            cursor = conn.execute(query, (last, mac_start_seconds, mac_start_seconds, batch_size))
            for row in cursor:
                r = dict(row)
                rowid = r["rowid"]
                content = _build_content_from_row(r)
                if not content:
                    continue
                mac_date = r.get("date")
                unix_ts = mac_epoch_to_unix(mac_date) if mac_date is not None else None
                is_from_me = r.get("is_from_me", 0)
                role = "user" if is_from_me else "other"
                context = _extract_imessage_context(r)
                if is_from_me:
                    sender_id = "self"
                else:
                    sender_id = _normalize_sender_id(r.get("sender_id")) or f"unknown:{r.get('handle_id')}"
                out = {
                    "id": f"imessage:{rowid}",
                    "thread_id": str(r.get("chat_id", "")),
                    "content": content,
                    "created_at": unix_ts,
                    "role": role,
                    "sender_id": sender_id,
                    "ROWID": rowid,
                }
                if context.get("reply_to_message_id"):
                    out["reply_to_message_id"] = context["reply_to_message_id"]
                if context.get("message_type"):
                    out["message_type"] = context["message_type"]
                if context.get("event_type"):
                    out["event_type"] = context["event_type"]
                if context.get("_metadata"):
                    out["_metadata"] = context["_metadata"]
                yield out
        finally:
            conn.close()
    finally:
        # Always remove the temp copy, even if the consumer abandons the generator.
        try:
            os.unlink(copy_path)
        except OSError:
            pass
463
+
464
+
465
def read_imessage_rows_list(
    last_rowid: Optional[str] = None,
    chat_db_path: Optional[Path] = None,
    batch_size: int = 5000,
    start_unix: Optional[float] = None,
) -> list[Dict[str, Any]]:
    """Eagerly collect everything ``read_imessage_rows`` yields into a list."""
    rows = read_imessage_rows(
        last_rowid=last_rowid,
        chat_db_path=chat_db_path,
        batch_size=batch_size,
        start_unix=start_unix,
    )
    return list(rows)
@@ -0,0 +1,132 @@
1
+ """Parse Signal export files (JSON) into normalized records for conversation_messages.
2
+
3
+ Supported format: JSON array of message objects. Each object may have:
4
+ - conversationId or conversation_id
5
+ - body or content
6
+ - sent_at (ms or sec) or created_at
7
+ - type: "outgoing" | "incoming" (for from_self)
8
+ - source or sender (phone number for identity matching)
9
+
10
+ message_id is stable: signal_import:{conversation_id}:{sent_at}:{content_hash} for idempotent re-upload.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import hashlib
16
+ import json
17
+ import logging
18
+ from datetime import datetime, timezone
19
+ from typing import Any, Dict, List, Optional
20
+
21
+ logger = logging.getLogger("topos.ingestion.sources.signal_export_parser")
22
+
23
+
24
def _norm_ts(sent_at: Any) -> str:
    """Turn a ``sent_at`` value into a timestamp string.

    None becomes the current UTC time in ISO form, strings are returned
    untouched, and numbers are read as epoch seconds (or milliseconds when
    above 1e12) and rendered as an ISO UTC timestamp. Anything else is
    stringified.
    """
    if sent_at is None:
        return datetime.now(timezone.utc).isoformat()
    if isinstance(sent_at, str):
        return sent_at
    if not isinstance(sent_at, (int, float)):
        return str(sent_at)
    # Values this large can only be milliseconds for realistic dates.
    epoch_seconds = sent_at / 1000.0 if sent_at > 1e12 else sent_at
    return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc).isoformat()
35
+
36
+
37
def _stable_message_id(conversation_id: str, sent_at: Any, content: str) -> str:
    """Derive a deterministic message id so re-imports upsert instead of duplicating.

    The id combines the conversation id, the timestamp in whole seconds (0 when
    ``sent_at`` is not numeric) and a 12-hex-digit SHA-256 prefix of the raw
    conversation/timestamp/content triple.
    """
    fingerprint_source = "{}:{}:{}".format(conversation_id, sent_at, content)
    fingerprint = hashlib.sha256(
        fingerprint_source.encode("utf-8", errors="replace")
    ).hexdigest()[:12]

    if isinstance(sent_at, (int, float)):
        epoch = int(sent_at)
        if epoch > 1e12:
            # Millisecond timestamps are reduced to seconds.
            epoch = int(epoch / 1000)
    else:
        epoch = 0
    return f"signal_import:{conversation_id}:{epoch}:{fingerprint}"
45
+
46
+
47
def parse_signal_export_json(
    data: bytes | str,
    *,
    my_phone_number: Optional[str] = None,
    owner_user_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Parse a Signal JSON export into staging records for ConversationsTablesManager.

    Args:
        data: JSON text or UTF-8 bytes. A top-level array of message objects is
            expected; a single top-level object is treated as a one-item array.
            Non-object array entries are skipped.
        my_phone_number: The owner's phone number. When given, a message whose
            source number contains it (or is contained in it) after stripping
            spaces and dashes is marked as sent by the owner.
        owner_user_id: Added to every record as ``owner_user_id`` when provided.

    Returns:
        One dict per message with message_id, conversation_id, thread_id, ts,
        sender_type (self|contact), content, source_id ("signal"), from_self,
        sender_id, message_type and event_type, plus optional
        reply_to_message_id, _metadata and owner_user_id.

    Raises:
        ValueError: If ``data`` is not valid JSON.
    """
    if isinstance(data, bytes):
        data = data.decode("utf-8", errors="replace")
    try:
        arr = json.loads(data)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON: {e}") from e
    if not isinstance(arr, list):
        arr = [arr]

    # Loop-invariant: the owner's number with spaces and dashes removed.
    norm_phone = ""
    if my_phone_number:
        norm_phone = my_phone_number.replace(" ", "").replace("-", "").strip()

    # Export fields copied verbatim into _metadata when present and not None.
    metadata_keys = (
        "quoteId", "quotedMessageId", "replyToMessageId", "reply_to_message_id",
        "quoteAuthorAci", "quoteAuthorUuid", "quoteAuthor", "quoteText", "quoteBody",
        "storyReplyContext", "groupV2Change", "groupUpdate", "groupChange",
        "callId", "callHistoryDetails", "expiresTimer", "expirationStartTimestamp",
        "isErased", "isViewOnce", "isStory",
    )

    records: List[Dict[str, Any]] = []
    for i, obj in enumerate(arr):
        if not isinstance(obj, dict):
            continue
        conv_id = str(obj.get("conversationId") or obj.get("conversation_id") or f"conv_{i}")
        body = obj.get("body") or obj.get("content") or ""
        sent_at = obj.get("sent_at") or obj.get("created_at") or obj.get("date")
        # str() guards against exports where "type" is not a string.
        msg_type = str(obj.get("type") or "").lower()
        source_phone = obj.get("source") or obj.get("sender") or obj.get("sender_phone")
        if isinstance(source_phone, dict):
            source_phone = source_phone.get("number") or source_phone.get("phone")
        source_phone = str(source_phone).strip() if source_phone else None

        from_self = msg_type == "outgoing"
        if norm_phone and source_phone:
            norm_source = source_phone.replace(" ", "").replace("-", "").strip()
            # Parenthesized on purpose: without the grouping, an empty
            # norm_source made `"" in norm_phone` true and flagged every such
            # message as sent by the owner.
            if norm_source and (norm_phone in norm_source or norm_source in norm_phone):
                from_self = True

        sender_type = "self" if from_self else "contact"
        message_type = "system" if msg_type and msg_type not in {"outgoing", "incoming"} else "message"
        event_type = f"signal_type:{msg_type}" if message_type == "system" else None
        reply_to_message_id = (
            obj.get("quoteId")
            or obj.get("quotedMessageId")
            or obj.get("replyToMessageId")
            or obj.get("reply_to_message_id")
        )

        message_id = _stable_message_id(conv_id, sent_at, body)
        ts = _norm_ts(sent_at)
        content = body
        if not content and message_type == "system":
            content = f"[system_event:{msg_type}]"
        rec = {
            "message_id": message_id,
            "conversation_id": conv_id,
            "thread_id": conv_id,
            "ts": ts,
            "sender_type": sender_type,
            "content": content,
            "source_id": "signal",
            "from_self": from_self,
            "sender_id": source_phone,
            "message_type": message_type,
            "event_type": event_type,
        }
        if reply_to_message_id is not None:
            rec["reply_to_message_id"] = str(reply_to_message_id)
        metadata = {key: obj[key] for key in metadata_keys if obj.get(key) is not None}
        if metadata:
            rec["_metadata"] = metadata
        if owner_user_id:
            rec["owner_user_id"] = owner_user_id
        records.append(rec)
    return records
+ return records