topos_node-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,491 @@
1
+ """Signal Desktop DB reader: open SQLCipher DB, query messages since checkpoint.
2
+
3
+ Requires pysqlcipher3 (pip install pysqlcipher3). Key from ~/Library/Application Support/Signal/config.json.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import base64
9
+ import importlib
10
+ import json
11
+ import logging
12
+ import os
13
+ import re
14
+ import subprocess
15
+ import sys
16
+ from pathlib import Path
17
+ from typing import Any, Dict, Optional
18
+
19
# Module logger; the name is spelled out explicitly rather than taken from __name__.
logger = logging.getLogger("topos.ingestion.sources.signal_reader")

# Default Signal data directory (under the user's ~/Library/Application Support)
# and the two files this module reads from it.
DEFAULT_SIGNAL_DIR = Path.home().joinpath("Library", "Application Support", "Signal")
DEFAULT_DB_PATH = DEFAULT_SIGNAL_DIR.joinpath("sql", "db.sqlite")
DEFAULT_CONFIG_PATH = DEFAULT_SIGNAL_DIR.joinpath("config.json")
24
+
25
+
26
def get_signal_paths() -> tuple[Path, Path]:
    """Resolve the Signal config file and database locations.

    Each location falls back to the module-level default and can be
    overridden with the ``SIGNAL_CONFIG_PATH`` / ``SIGNAL_DB_PATH``
    environment variables.

    Returns:
        A ``(config_path, db_path)`` pair.
    """
    env = os.environ
    config_location = env.get("SIGNAL_CONFIG_PATH", str(DEFAULT_CONFIG_PATH))
    db_location = env.get("SIGNAL_DB_PATH", str(DEFAULT_DB_PATH))
    return Path(config_location), Path(db_location)
31
+
32
+
33
def _normalize_hex_key(value: Any) -> Optional[str]:
    """Return the bare hex digits of a key, or ``None`` if it is not hex.

    Accepted spellings (surrounding whitespace is ignored):
      * plain hex digits, e.g. ``"ab12"``
      * ``0x`` / ``0X`` prefixed hex, e.g. ``"0xab12"``
      * blob-literal form, e.g. ``"x'ab12'"`` or ``"X'ab12'"``

    Args:
        value: Candidate key; anything that is not a ``str`` is rejected.

    Returns:
        The hex digits with prefix/quotes removed (letter case of the digits
        is preserved), or ``None`` when the input is not a non-empty hex
        string.
    """
    if not isinstance(value, str):
        return None
    stripped = value.strip()
    # Prefixes are matched case-insensitively; the uppercase spellings
    # ("0X..." / "X'...'") used to be rejected as "not hex".
    if stripped[:2].lower() == "0x":
        stripped = stripped[2:]
    if stripped[:2].lower() == "x'" and stripped.endswith("'") and len(stripped) >= 4:
        stripped = stripped[2:-1]
    if re.fullmatch(r"[0-9a-fA-F]+", stripped):
        return stripped
    return None
44
+
45
+
46
def _get_macos_safe_storage_password() -> Optional[str]:
    """Look up Signal's Safe Storage secret in the macOS Keychain (best effort).

    Runs ``security find-generic-password -s <service> -w`` for each known
    service name and returns the first non-empty output.

    Returns:
        The stripped secret, or ``None`` when not on macOS, when no service
        yields a value, or when every lookup fails.
    """
    if sys.platform != "darwin":
        return None

    for service_name in ("Signal Safe Storage", "Signal"):
        command = ["security", "find-generic-password", "-s", service_name, "-w"]
        try:
            result = subprocess.run(command, capture_output=True, text=True, check=False)
            secret = result.stdout.strip()
        except Exception:
            # Best effort: a failed lookup just moves on to the next service name.
            continue
        if result.returncode == 0 and secret:
            return secret
    return None
67
+
68
+
69
def _decode_encrypted_key_blob(encrypted_key: str) -> Optional[bytes]:
    """Decode the textual ``encryptedKey`` value into raw ciphertext bytes.

    Returns ``None`` when the text is neither hex nor base64, or decodes to
    nothing.
    """
    text = encrypted_key.strip()
    # Hex has to be tried first: every hex digit is also a valid base64
    # character, so a hex string whose length is a multiple of 4 would
    # otherwise be silently "decoded" as base64 into garbage.
    if len(text) % 2 == 0 and re.fullmatch(r"[0-9a-fA-F]+", text):
        return bytes.fromhex(text)
    try:
        return base64.b64decode(text) or None
    except ValueError:  # binascii.Error is a ValueError subclass
        pass
    try:
        # Lenient hex fallback (bytes.fromhex tolerates embedded whitespace).
        return bytes.fromhex(text) or None
    except ValueError:
        return None


def _strip_pkcs7_padding(data: bytes) -> Optional[bytes]:
    """Remove PKCS#7 padding (16-byte blocks); ``None`` if the padding is invalid."""
    if not data:
        return None
    pad_len = data[-1]
    if not 1 <= pad_len <= 16 or not data.endswith(bytes([pad_len]) * pad_len):
        return None
    return data[:-pad_len]


def _decrypt_signal_encrypted_key(encrypted_key: str) -> Optional[str]:
    """Decrypt Electron safeStorage encryptedKey on macOS.

    Signal Desktop stores encryptedKey via Electron safeStorage. On macOS, this
    can be decrypted with the "Signal Safe Storage" keychain secret.

    The ciphertext is decoded from hex or base64, an optional ``v10`` tag is
    dropped, and the remainder is decrypted with AES-128-CBC using a key
    derived from the Keychain secret (PBKDF2-HMAC-SHA1, salt ``saltysalt``,
    1003 iterations, IV of 16 spaces).

    Args:
        encrypted_key: The ``encryptedKey`` string from Signal's config.json.

    Returns:
        The SQLCipher key as hex digits, or ``None`` when not on macOS, the
        Keychain secret is unavailable, the value cannot be decoded, the
        ``cryptography`` package cannot be imported, or decryption fails
        (including invalid padding, which indicates a wrong secret or a
        corrupt blob).
    """
    if sys.platform != "darwin":
        return None

    safe_storage_password = _get_macos_safe_storage_password()
    if not safe_storage_password:
        logger.warning("Signal encryptedKey present but Safe Storage password was not found in Keychain")
        return None

    raw = _decode_encrypted_key_blob(encrypted_key)
    if not raw:
        logger.warning("Signal encryptedKey format is not base64/hex-decodable")
        return None

    # Drop the leading version tag when present.
    if raw.startswith(b"v10"):
        raw = raw[3:]
    if not raw:
        return None

    # cryptography is imported dynamically so the module loads without it.
    try:
        backends_mod = importlib.import_module("cryptography.hazmat.backends")
        primitives_mod = importlib.import_module("cryptography.hazmat.primitives")
        ciphers_mod = importlib.import_module("cryptography.hazmat.primitives.ciphers")
        pbkdf2_mod = importlib.import_module("cryptography.hazmat.primitives.kdf.pbkdf2")
        default_backend = getattr(backends_mod, "default_backend")
        hashes = getattr(primitives_mod, "hashes")
        Cipher = getattr(ciphers_mod, "Cipher")
        algorithms = getattr(ciphers_mod, "algorithms")
        modes = getattr(ciphers_mod, "modes")
        PBKDF2HMAC = getattr(pbkdf2_mod, "PBKDF2HMAC")
    except Exception as e:
        logger.warning("cryptography import failed for Signal encryptedKey decrypt: %s", e)
        return None

    try:
        # Electron/Chromium OSCrypt compatibility (macOS).
        kdf = PBKDF2HMAC(
            algorithm=hashes.SHA1(),
            length=16,
            salt=b"saltysalt",
            iterations=1003,
            backend=default_backend(),
        )
        aes_key = kdf.derive(safe_storage_password.encode("utf-8"))
        cipher = Cipher(algorithms.AES(aes_key), modes.CBC(b" " * 16), backend=default_backend())
        decryptor = cipher.decryptor()
        plaintext = decryptor.update(raw) + decryptor.finalize()
    except Exception as e:
        logger.warning("Signal encryptedKey decryption failed: %s", e)
        return None

    # Invalid padding means the decryption did not really succeed; returning
    # the bytes anyway would hand the caller a garbage key.
    unpadded = _strip_pkcs7_padding(plaintext)
    if unpadded is None:
        logger.warning("Signal encryptedKey decryption failed: invalid PKCS#7 padding")
        return None

    # First try direct text forms.
    text = unpadded.decode("utf-8", errors="ignore").strip().strip("\x00")
    normalized = _normalize_hex_key(text)
    if normalized:
        return normalized

    # Some builds may store binary key material; fall back to hex-encoding bytes.
    return unpadded.hex() or None
149
+
150
+
151
def get_signal_key(config_path: Optional[Path] = None, preferred_hex_key: Optional[str] = None) -> Optional[str]:
    """Resolve the Signal SQLCipher key as a hex string.

    Sources are consulted in order and the first usable one wins:
      1. ``preferred_hex_key`` (argument),
      2. the ``SIGNAL_KEY_HEX`` or ``SIGNAL_SQLCIPHER_KEY`` environment variable,
      3. the ``key`` entry of Signal's config.json,
      4. the ``encryptedKey`` entry of config.json, after decryption.

    Note: `encryptedKey` is not a raw SQLCipher key and cannot be used directly.

    Args:
        config_path: config.json location; defaults to ``get_signal_paths()[0]``.
        preferred_hex_key: Caller-supplied key that takes precedence when hex.

    Returns:
        The hex key, or ``None`` if no source yields one (including when the
        config file is missing or unreadable).
    """
    explicit_sources = (
        (
            preferred_hex_key,
            "Preferred Signal sync key was provided but is not hex-formatted",
        ),
        (
            os.environ.get("SIGNAL_KEY_HEX") or os.environ.get("SIGNAL_SQLCIPHER_KEY"),
            "SIGNAL_KEY_HEX/SIGNAL_SQLCIPHER_KEY is set but is not hex-formatted",
        ),
    )
    for candidate, complaint in explicit_sources:
        if not (isinstance(candidate, str) and candidate.strip()):
            continue
        hex_key = _normalize_hex_key(candidate)
        if hex_key:
            return hex_key
        logger.warning(complaint)

    resolved_config = config_path or get_signal_paths()[0]
    if not resolved_config.exists():
        return None

    try:
        with open(resolved_config, encoding="utf-8") as handle:
            config = json.load(handle)

        plain_key = config.get("key")
        hex_key = _normalize_hex_key(plain_key)
        if hex_key:
            return hex_key
        if isinstance(plain_key, str):
            logger.warning("Signal config key exists but is not hex-formatted")

        wrapped_key = config.get("encryptedKey")
        if isinstance(wrapped_key, str) and wrapped_key.strip():
            unwrapped = _decrypt_signal_encrypted_key(wrapped_key.strip())
            if unwrapped:
                logger.info("Signal encryptedKey decrypted via Keychain")
                return unwrapped
            logger.warning("Signal config has encryptedKey but decryption failed")
    except Exception as exc:
        # Any read/parse problem is reported and treated as "no key".
        logger.warning("get_signal_key failed: %s", exc)
    return None
192
+
193
+
194
def _normalize_signal_ts_seconds(value: Any) -> Optional[float]:
    """Normalize Signal timestamp values to Unix seconds.

    The unit is inferred from the magnitude of the number:
      * ``>= 1e17`` is treated as nanoseconds,
      * ``>= 1e14`` as microseconds,
      * ``>= 1e11`` as milliseconds,
      * anything smaller as seconds already.

    Args:
        value: A number or numeric string; ``None`` is allowed.

    Returns:
        The timestamp in seconds, or ``None`` when ``value`` is ``None``,
        not convertible to ``float``, or not finite (NaN/infinity cannot be
        turned into an integer timestamp by callers).
    """
    import math

    if value is None:
        return None
    try:
        ts = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    if not math.isfinite(ts):
        return None

    magnitude = abs(ts)
    if magnitude >= 1e17:
        return ts / 1_000_000_000.0
    if magnitude >= 1e14:
        return ts / 1_000_000.0
    if magnitude >= 1e11:
        return ts / 1_000.0
    return ts
210
+
211
+
212
def _normalize_signal_sender_id(value: Any) -> Optional[str]:
    """Return ``value`` as a stripped string, or ``None`` if missing or blank."""
    cleaned = "" if value is None else str(value).strip()
    return cleaned if cleaned else None
217
+
218
+
219
def _safe_json_loads(value: Any) -> Optional[Dict[str, Any]]:
    """Parse ``value`` into a JSON object without ever raising.

    Args:
        value: A dict (returned unchanged), a JSON string, or anything else.

    Returns:
        The dict, or ``None`` when ``value`` is not a dict/str, is blank,
        fails to parse, or parses to something other than a JSON object.
    """
    if isinstance(value, dict):
        return value
    if not isinstance(value, str) or not value.strip():
        return None
    try:
        decoded = json.loads(value.strip())
    except Exception:
        return None
    if isinstance(decoded, dict):
        return decoded
    return None
234
+
235
+
236
def _extract_reply_from_signal_json(payload: Dict[str, Any]) -> tuple[Optional[str], Dict[str, Any]]:
    """Extract reply linkage from Signal JSON payload shape variants.

    The replied-to message id is taken from the first non-``None`` match, in
    this order: a top-level id key, an id key inside ``quote``, an id key
    inside ``storyReplyContext``.

    Args:
        payload: Parsed message JSON.

    Returns:
        ``(reply_to, metadata)`` where ``reply_to`` is the id as a string (or
        ``None``) and ``metadata`` holds the matched top-level key plus the
        ``quote`` / ``storyReplyContext`` dicts when present.
    """

    def first_value(mapping: Dict[str, Any], names: tuple) -> Any:
        # First non-None value among `names`, or None when there is none.
        return next((mapping.get(n) for n in names if mapping.get(n) is not None), None)

    top_level_keys = ("replyToMessageId", "reply_to_message_id", "quotedMessageId", "quoteId")
    nested_sources = (
        ("quote", ("id", "messageId", "message_id", "targetMessageId")),
        ("storyReplyContext", ("messageId", "message_id", "targetMessageId")),
    )

    metadata: Dict[str, Any] = {}
    reply_to: Optional[str] = None

    matched_key = next((k for k in top_level_keys if payload.get(k) is not None), None)
    if matched_key is not None:
        metadata[matched_key] = payload.get(matched_key)
        reply_to = str(payload.get(matched_key))

    for container_key, id_keys in nested_sources:
        container = payload.get(container_key)
        if not isinstance(container, dict):
            continue
        metadata[container_key] = container
        if reply_to is not None:
            continue
        nested_id = first_value(container, id_keys)
        if nested_id is not None:
            reply_to = str(nested_id)

    return reply_to, metadata
266
+
267
+
268
def _open_signal_db(sqlcipher: Any, db_path: Path, key: str) -> Any:
    """Open the encrypted Signal DB, trying each cipher compatibility mode.

    Args:
        sqlcipher: The ``pysqlcipher3.dbapi2`` module.
        db_path: Path of the database file.
        key: Raw key as hex digits (already validated as hex by the caller,
            so it is safe to interpolate into the PRAGMA).

    Returns:
        An open connection on which a read of ``sqlite_master`` succeeded.

    Raises:
        ValueError: If no compatibility mode can read the database.
    """
    key_hex_expr = f"x'{key}'"
    last_open_error: Optional[str] = None
    # Different Signal versions/DBs can require different compatibility modes.
    for compat in (4, 3):
        candidate = None
        try:
            candidate = sqlcipher.connect(str(db_path))
            # The key must be supplied first; cipher_compatibility adjusts the
            # settings of the keyed connection and has to precede the first
            # real read.
            candidate.execute(f'PRAGMA key = "{key_hex_expr}"')
            candidate.execute(f"PRAGMA cipher_compatibility = {compat}")
            # Reading the schema fails unless key and settings are correct.
            candidate.execute("SELECT count(*) FROM sqlite_master")
            return candidate
        except Exception as e:
            last_open_error = str(e)
            if candidate is not None:
                try:
                    candidate.close()
                except Exception:
                    pass

    raise ValueError(
        f"Unable to open Signal DB at {db_path} with available SQLCipher settings: "
        f"{last_open_error or 'unknown error'}"
    )


def _signal_row_to_record(
    r: Dict[str, Any],
    sender_cols: list[str],
    reply_cols: list[str],
    system_cols: list[str],
    json_cols: list[str],
) -> Dict[str, Any]:
    """Convert one ``messages`` row (as a dict) into the output record shape."""
    msg_id = r.get("id") or r.get("rowid")
    sent_at_seconds = _normalize_signal_ts_seconds(r.get("sent_at"))
    sent_at = sent_at_seconds if sent_at_seconds is not None else 0

    # Anything that is not a plain incoming/outgoing message is a system event.
    msg_type = (r.get("type") or "").lower()
    role = "user" if msg_type == "outgoing" else "other"
    message_type = "system" if msg_type not in {"outgoing", "incoming"} else "message"
    event_type = f"signal_type:{msg_type}" if message_type == "system" and msg_type else None

    sender_id = _normalize_signal_sender_id(next((r.get(c) for c in sender_cols if r.get(c)), None))
    if role == "user":
        sender_id = "self"
    if not sender_id:
        sender_id = f"unknown:{msg_id}"

    reply_to_message_id = next(
        (
            str(r.get(c))
            for c in ("quoteId", "quotedMessageId", "replyToMessageId", "reply_to_message_id")
            if c in reply_cols and r.get(c)
        ),
        None,
    )

    content = (r.get("body") or "").strip()
    if not content and message_type == "system":
        content = f"[system_event:{msg_type or 'signal'}]"

    # Keep every populated reply/system column as metadata.
    metadata: Dict[str, Any] = {
        c: r.get(c) for c in (*reply_cols, *system_cols) if r.get(c) is not None
    }

    # Many Signal Desktop builds keep reply context in JSON payload instead of dedicated columns.
    json_payload = next((p for p in (_safe_json_loads(r.get(c)) for c in json_cols) if p), None)
    if json_payload:
        json_reply_to, json_reply_meta = _extract_reply_from_signal_json(json_payload)
        if reply_to_message_id is None and json_reply_to is not None:
            reply_to_message_id = json_reply_to
        metadata.update(json_reply_meta)

    row_out: Dict[str, Any] = {
        "id": f"signal:{msg_id}:{int(sent_at)}",
        "thread_id": str(r.get("conversation_id") or ""),
        "content": content,
        "created_at": sent_at,
        "role": role,
        "sender_id": sender_id,
        "message_type": message_type,
        "event_type": event_type,
        "reply_to_message_id": reply_to_message_id,
        "ROWID": msg_id,
        "sent_at": sent_at,
    }
    if metadata:
        row_out["_metadata"] = metadata
    return row_out


def read_signal_rows(
    last_record_id: Optional[str] = None,
    config_path: Optional[Path] = None,
    db_path: Optional[Path] = None,
    my_phone_number: Optional[str] = None,
    batch_size: int = 5000,
    start_unix: Optional[float] = None,
    signal_key_hex: Optional[str] = None,
) -> list[Dict[str, Any]]:
    """
    Open Signal SQLCipher DB and return message rows since last_record_id.
    Each row: id (signal:{id}:{sent_at seconds}), thread_id (conversationId),
    content (body), created_at (Unix), role (user/other from type), ROWID/id,
    plus sender_id, message_type, event_type, reply_to_message_id, sent_at and
    an optional _metadata dict.

    Args:
        last_record_id: Checkpoint of the form ``signal:<id>:<unix seconds>``;
            only rows sent strictly after that time are returned. Anything
            else is treated as "no checkpoint".
        config_path: Signal config.json; defaults to ``get_signal_paths()``.
        db_path: Signal database file; defaults to ``get_signal_paths()``.
        my_phone_number: Accepted for interface compatibility; not used here.
        batch_size: Maximum number of rows to return (SQL ``LIMIT``).
        start_unix: Optional lower bound (Unix seconds, inclusive).
        signal_key_hex: Optional hex key that takes precedence over the
            environment and config.json.

    Returns:
        Up to ``batch_size`` records ordered by ``sent_at``.

    Raises:
        ImportError: If pysqlcipher3 is not installed.
        FileNotFoundError: If the database file does not exist.
        ValueError: If no key can be resolved, the DB cannot be opened, or
            the ``messages`` table lacks required columns.
    """
    try:
        from pysqlcipher3 import dbapi2 as sqlcipher
    except ImportError as e:
        raise ImportError(
            "pysqlcipher3 required for Signal sync. Install with: pip install pysqlcipher3"
        ) from e

    # Resolve defaults with a single lookup instead of one per path.
    if not config_path or not db_path:
        default_config, default_db = get_signal_paths()
        config_path = config_path or default_config
        db_path = db_path or default_db
    if not db_path.exists():
        raise FileNotFoundError(f"Signal DB not found at {db_path}")
    key = get_signal_key(config_path, preferred_hex_key=signal_key_hex)
    if not key:
        raise ValueError(
            "Signal SQLCipher key unavailable. Could not resolve raw key from config.json "
            "or macOS Keychain. Workaround: set SIGNAL_KEY_HEX to a raw SQLCipher hex key."
        )

    conn = _open_signal_db(sqlcipher, db_path, key)

    # Rows come back as {column_name: value} dicts.
    conn.row_factory = lambda c, r: dict(zip([col[0] for col in c.description], r))
    try:
        # Signal Desktop schema varies by version; detect available columns first.
        table_info_rows = conn.execute("PRAGMA table_info(messages)").fetchall()
        available_columns = {str(row.get("name") or "") for row in table_info_rows}

        if not {"id", "body", "sent_at"} <= available_columns:
            raise ValueError(
                "Signal messages table is missing required columns (id/body/sent_at). "
                f"Available columns: {sorted(c for c in available_columns if c)}"
            )

        conversation_col = next(
            (c for c in ("conversationId", "conversation_id") if c in available_columns),
            None,
        )
        if conversation_col is None:
            raise ValueError(
                "Signal messages table is missing conversation column (conversationId/conversation_id). "
                f"Available columns: {sorted(c for c in available_columns if c)}"
            )

        def present(names: tuple) -> list[str]:
            # Optional columns that exist in this schema variant, in order.
            return [c for c in names if c in available_columns]

        def select_suffix(cols: list[str]) -> str:
            # ", a, b" for the SELECT list, or "" when there is nothing to add.
            return ", " + ", ".join(cols) if cols else ""

        sender_cols = present(("sourceServiceId", "sourceUuid", "source"))
        reply_cols = present(
            (
                "quoteId",
                "quotedMessageId",
                "replyToMessageId",
                "reply_to_message_id",
                "quoteAuthorAci",
                "quoteAuthorUuid",
                "quoteAuthor",
                "quoteText",
                "quoteBody",
                "storyReplyContext",
            )
        )
        system_cols = present(
            (
                "groupV2Change",
                "groupUpdate",
                "groupChange",
                "callId",
                "callHistoryDetails",
                "expiresTimer",
                "expirationStartTimestamp",
                "isErased",
                "isViewOnce",
                "isStory",
            )
        )
        json_cols = present(("json", "messageJson", "payload_json"))

        last_ts: float = 0.0
        if last_record_id:
            parts = last_record_id.split(":")
            if len(parts) >= 3 and parts[0] == "signal":
                try:
                    last_ts = float(parts[2])
                except ValueError:
                    pass
        # Query-side normalization converts sent_at to milliseconds.
        # NOTE(review): record ids carry whole seconds only, so rows sent later
        # within the checkpoint's second appear to match again on the next
        # call; presumably callers de-duplicate by id. Confirm before changing.
        last_ts_ms = float(last_ts) * 1000.0

        start_ms: Optional[int] = None
        if start_unix is not None:
            try:
                start_ms = int(float(start_unix) * 1000.0)
            except Exception:
                start_ms = None

        # Same magnitude thresholds as _normalize_signal_ts_seconds, but the
        # result is in milliseconds (ns / us / ms / s inputs respectively).
        normalized_sent_at_expr = """
            CASE
                WHEN abs(sent_at) >= 100000000000000000 THEN (sent_at / 1000000.0)
                WHEN abs(sent_at) >= 100000000000000 THEN (sent_at / 1000.0)
                WHEN abs(sent_at) >= 100000000000 THEN (sent_at * 1.0)
                ELSE (sent_at * 1000.0)
            END
        """

        # Signal Desktop: read only columns that exist in this schema variant.
        # Only column names from the fixed allow-lists above are interpolated;
        # all values are bound as parameters.
        optional_select = "".join(
            select_suffix(cols) for cols in (sender_cols, reply_cols, system_cols, json_cols)
        )
        query = f"""
            SELECT id, body, sent_at, type, {conversation_col} AS conversation_id{optional_select}
            FROM messages
            WHERE ({normalized_sent_at_expr}) > ?
              AND (
                ? IS NULL
                OR ({normalized_sent_at_expr}) >= ?
              )
            ORDER BY sent_at
            LIMIT ?
        """
        rows = conn.execute(query, (last_ts_ms, start_ms, start_ms, batch_size)).fetchall()
        return [
            _signal_row_to_record(r, sender_cols, reply_cols, system_cols, json_cols)
            for r in rows
        ]
    finally:
        conn.close()
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+ from typing import Dict, Optional
6
+
7
+
8
class JobState(str, Enum):
    """States an ingestion job can be in.

    Members are ``str`` instances, so each compares equal to its value.
    """

    # Not yet started / started
    QUEUED = "queued"
    RUNNING = "running"

    # Pipeline stages
    PARSING = "parsing"
    RAW_ENRICH = "raw_enrich"
    CANONICALIZE = "canonicalize"
    CANONICAL_ENRICH = "canonical_enrich"
    VECTOR_INDEX = "vector_index"

    # Outcomes
    COMPLETE = "complete"
    FAILED = "failed"
    RETRYING = "retrying"
19
+
20
+
21
class JobEvent(str, Enum):
    """Events that drive an ingestion job from one state to another.

    Members are ``str`` instances, so each compares equal to its value.
    """

    # Kick-off
    START = "start"

    # Stage progress
    PARSING_STARTED = "parsing_started"
    PARSING_COMPLETED = "parsing_completed"
    RAW_ENRICHED = "raw_enriched"
    CANONICALIZED = "canonicalized"
    CANONICAL_ENRICHED = "canonical_enriched"
    VECTOR_INDEXED = "vector_indexed"

    # Failure handling
    FAIL = "fail"
    RETRY = "retry"
31
+
32
+
33
@dataclass(frozen=True)
class IngestionJob:
    """Immutable description of one ingestion job.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Required identifiers.
    job_id: str
    dataset_id: str
    schema_id: str

    # Free-form string metadata; each instance gets its own dict.
    metadata: Dict[str, str] = field(default_factory=dict)

    # New jobs start out queued.
    state: JobState = JobState.QUEUED

    # Optional checkpoint reference; None when there is none.
    checkpoint_id: Optional[str] = None
41
+
42
+
43
class IngestionStateMachine:
    """State transition contract for ingestion jobs.

    Subclasses override :meth:`transition`; this base implementation always
    raises ``NotImplementedError``.
    """

    def transition(self, job: IngestionJob, event: JobEvent) -> JobState:
        """Return the state ``job`` moves to when ``event`` occurs."""
        raise NotImplementedError()
48
+
49
+
50
class DefaultStateMachine(IngestionStateMachine):
    """Table-driven state machine for the ingestion pipeline.

    ``FAIL`` is accepted in every state, ``RETRY`` only from ``FAILED``;
    every other move must be listed in ``_transitions``.
    """

    # current state -> {event -> next state}
    _transitions: Dict[JobState, Dict[JobEvent, JobState]] = {
        # Entry points: a fresh or retried job starts running.
        JobState.QUEUED: {JobEvent.START: JobState.RUNNING},
        JobState.RETRYING: {JobEvent.START: JobState.RUNNING},
        # Pipeline stages, each advanced by its completion event.
        JobState.RUNNING: {JobEvent.PARSING_STARTED: JobState.PARSING},
        JobState.PARSING: {JobEvent.PARSING_COMPLETED: JobState.RAW_ENRICH},
        JobState.RAW_ENRICH: {JobEvent.RAW_ENRICHED: JobState.CANONICALIZE},
        JobState.CANONICALIZE: {JobEvent.CANONICALIZED: JobState.CANONICAL_ENRICH},
        JobState.CANONICAL_ENRICH: {JobEvent.CANONICAL_ENRICHED: JobState.VECTOR_INDEX},
        JobState.VECTOR_INDEX: {JobEvent.VECTOR_INDEXED: JobState.COMPLETE},
    }

    def transition(self, job: IngestionJob, event: JobEvent) -> JobState:
        """Return the next state for ``job`` given ``event``.

        Raises:
            ValueError: If ``event`` is not allowed in the job's current state.
        """
        # Failing is always possible and does not depend on the current state.
        if event == JobEvent.FAIL:
            return JobState.FAILED

        current = job.state
        if event == JobEvent.RETRY and current == JobState.FAILED:
            return JobState.RETRYING

        allowed = self._transitions.get(current, {})
        target = allowed.get(event)
        if target:
            return target
        raise ValueError(f"Invalid transition: {job.state} + {event}")
@@ -0,0 +1 @@
1
+ """Ingestion triggers."""
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ from ..state_machine import IngestionJob
7
+ from ...storage.raw.file_store import RawFileStore
8
+ from ...storage.raw.raw_store import RawFile
9
+
10
+
11
@dataclass
class FileTrigger:
    """Creates ingestion jobs after handing the input to a raw file store."""

    # Store that receives the file (or bytes) before the job is created.
    file_store: RawFileStore

    def create_job(
        self,
        job_id: str,
        dataset_id: str,
        schema_id: str,
        file_path: str,
        file_format: str = "jsonl",
    ) -> IngestionJob:
        """Write the file at ``file_path`` to the store and return a new job.

        The dataset and schema ids are attached to the stored file as
        metadata; the job's own metadata records ``file_format``.
        """
        file_metadata = {"dataset_id": dataset_id, "schema_id": schema_id}
        self.file_store.write_file(RawFile(file_path=file_path, metadata=file_metadata))
        job_metadata = {"file_format": file_format}
        return IngestionJob(
            job_id=job_id,
            dataset_id=dataset_id,
            schema_id=schema_id,
            metadata=job_metadata,
        )

    def create_job_from_bytes(
        self,
        job_id: str,
        dataset_id: str,
        schema_id: str,
        payload: bytes,
        file_format: str = "jsonl",
    ) -> IngestionJob:
        """Write ``payload`` to the store and return a new job.

        The job's metadata records ``file_format``.
        """
        self.file_store.write_bytes(dataset_id, schema_id, payload)
        job_metadata = {"file_format": file_format}
        return IngestionJob(
            job_id=job_id,
            dataset_id=dataset_id,
            schema_id=schema_id,
            metadata=job_metadata,
        )
@@ -0,0 +1,18 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Iterable, List
5
+
6
+ from ..state_machine import IngestionJob
7
+
8
+
9
@dataclass
class SQLiteTrigger:
    """Stub trigger for raw table writes.

    Both methods are placeholders that never produce jobs.
    """

    def poll(self) -> Iterable[IngestionJob]:
        """Return the jobs found by polling; currently always an empty list."""
        no_jobs: list = []
        return no_jobs

    def enqueue_from_records(self, records: List[dict]) -> List[IngestionJob]:
        """Return the jobs created for ``records``; currently always empty.

        ``records`` is accepted but not inspected.
        """
        del records  # intentionally unused by the stub
        no_jobs: list = []
        return no_jobs
@@ -0,0 +1 @@
1
+ """Validation utilities for ingestion."""
@@ -0,0 +1,27 @@
1
+ """Validation primitives for ingestion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, Optional
7
+
8
+
9
@dataclass(frozen=True)
class ValidationResult:
    """Outcome of validating one record (fields cannot be reassigned)."""

    # Whether the record passed validation.
    is_valid: bool
    # Error messages collected during validation.
    errors: list[str]
    # Additional data attached by the validator.
    metadata: Dict[str, Any]
14
+
15
+
16
@dataclass(frozen=True)
class SchemaDefinition:
    """A versioned schema together with its raw definition (fields cannot be reassigned)."""

    # Identifier of the schema.
    schema_id: str
    # Version label of this definition.
    version: str
    # The schema body as a plain dict.
    raw_schema: Dict[str, Any]
21
+
22
+
23
class SchemaValidator:
    """Validates raw records against a schema definition.

    This base class only defines the interface; :meth:`validate` always
    raises ``NotImplementedError``.
    """

    def validate(self, record: Dict[str, Any], schema: Optional[SchemaDefinition] = None) -> ValidationResult:
        """Validate ``record``, optionally against ``schema``."""
        raise NotImplementedError()