topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,491 @@
|
|
|
1
|
+
"""Signal Desktop DB reader: open SQLCipher DB, query messages since checkpoint.
|
|
2
|
+
|
|
3
|
+
Requires pysqlcipher3 (pip install pysqlcipher3). Key from ~/Library/Application Support/Signal/config.json.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import base64
|
|
9
|
+
import importlib
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
import re
|
|
14
|
+
import subprocess
|
|
15
|
+
import sys
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Dict, Optional
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger("topos.ingestion.sources.signal_reader")
|
|
20
|
+
|
|
21
|
+
DEFAULT_SIGNAL_DIR = Path.home() / "Library" / "Application Support" / "Signal"
|
|
22
|
+
DEFAULT_DB_PATH = DEFAULT_SIGNAL_DIR / "sql" / "db.sqlite"
|
|
23
|
+
DEFAULT_CONFIG_PATH = DEFAULT_SIGNAL_DIR / "config.json"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_signal_paths() -> tuple[Path, Path]:
    """Return ``(config_path, db_path)`` for the Signal Desktop installation.

    Each path defaults to the module-level location and can be overridden with
    the ``SIGNAL_CONFIG_PATH`` / ``SIGNAL_DB_PATH`` environment variables.
    An override that is empty or whitespace-only is treated as unset, and a
    leading ``~`` in an override is expanded to the user's home directory.
    """
    config_override = os.environ.get("SIGNAL_CONFIG_PATH", "")
    db_override = os.environ.get("SIGNAL_DB_PATH", "")
    # Path("") silently becomes ".", which would point at the working directory,
    # so blank overrides fall back to the defaults instead.
    config = Path(config_override).expanduser() if config_override.strip() else DEFAULT_CONFIG_PATH
    db = Path(db_override).expanduser() if db_override.strip() else DEFAULT_DB_PATH
    return config, db
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _normalize_hex_key(value: Any) -> Optional[str]:
|
|
34
|
+
if not isinstance(value, str):
|
|
35
|
+
return None
|
|
36
|
+
stripped = value.strip()
|
|
37
|
+
if stripped.startswith("0x"):
|
|
38
|
+
stripped = stripped[2:]
|
|
39
|
+
if stripped.startswith("x'") and stripped.endswith("'") and len(stripped) >= 4:
|
|
40
|
+
stripped = stripped[2:-1]
|
|
41
|
+
if re.fullmatch(r"[0-9a-fA-F]+", stripped):
|
|
42
|
+
return stripped
|
|
43
|
+
return None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _get_macos_safe_storage_password() -> Optional[str]:
|
|
47
|
+
"""Best-effort retrieval of Signal Safe Storage password from Keychain."""
|
|
48
|
+
if sys.platform != "darwin":
|
|
49
|
+
return None
|
|
50
|
+
services = [
|
|
51
|
+
"Signal Safe Storage",
|
|
52
|
+
"Signal",
|
|
53
|
+
]
|
|
54
|
+
for service in services:
|
|
55
|
+
try:
|
|
56
|
+
proc = subprocess.run(
|
|
57
|
+
["security", "find-generic-password", "-s", service, "-w"],
|
|
58
|
+
capture_output=True,
|
|
59
|
+
text=True,
|
|
60
|
+
check=False,
|
|
61
|
+
)
|
|
62
|
+
if proc.returncode == 0 and proc.stdout.strip():
|
|
63
|
+
return proc.stdout.strip()
|
|
64
|
+
except Exception:
|
|
65
|
+
continue
|
|
66
|
+
return None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _decrypt_signal_encrypted_key(encrypted_key: str) -> Optional[str]:
|
|
70
|
+
"""Decrypt Electron safeStorage encryptedKey on macOS.
|
|
71
|
+
|
|
72
|
+
Signal Desktop stores encryptedKey via Electron safeStorage. On macOS, this
|
|
73
|
+
can be decrypted with the "Signal Safe Storage" keychain secret.
|
|
74
|
+
"""
|
|
75
|
+
if sys.platform != "darwin":
|
|
76
|
+
return None
|
|
77
|
+
|
|
78
|
+
safe_storage_password = _get_macos_safe_storage_password()
|
|
79
|
+
if not safe_storage_password:
|
|
80
|
+
logger.warning("Signal encryptedKey present but Safe Storage password was not found in Keychain")
|
|
81
|
+
return None
|
|
82
|
+
|
|
83
|
+
raw: Optional[bytes] = None
|
|
84
|
+
try:
|
|
85
|
+
raw = base64.b64decode(encrypted_key)
|
|
86
|
+
except Exception:
|
|
87
|
+
try:
|
|
88
|
+
raw = bytes.fromhex(encrypted_key)
|
|
89
|
+
except Exception:
|
|
90
|
+
raw = None
|
|
91
|
+
if not raw:
|
|
92
|
+
logger.warning("Signal encryptedKey format is not base64/hex-decodable")
|
|
93
|
+
return None
|
|
94
|
+
|
|
95
|
+
if raw.startswith(b"v10"):
|
|
96
|
+
raw = raw[3:]
|
|
97
|
+
if not raw:
|
|
98
|
+
return None
|
|
99
|
+
|
|
100
|
+
try:
|
|
101
|
+
backends_mod = importlib.import_module("cryptography.hazmat.backends")
|
|
102
|
+
primitives_mod = importlib.import_module("cryptography.hazmat.primitives")
|
|
103
|
+
ciphers_mod = importlib.import_module("cryptography.hazmat.primitives.ciphers")
|
|
104
|
+
pbkdf2_mod = importlib.import_module("cryptography.hazmat.primitives.kdf.pbkdf2")
|
|
105
|
+
default_backend = getattr(backends_mod, "default_backend")
|
|
106
|
+
hashes = getattr(primitives_mod, "hashes")
|
|
107
|
+
Cipher = getattr(ciphers_mod, "Cipher")
|
|
108
|
+
algorithms = getattr(ciphers_mod, "algorithms")
|
|
109
|
+
modes = getattr(ciphers_mod, "modes")
|
|
110
|
+
PBKDF2HMAC = getattr(pbkdf2_mod, "PBKDF2HMAC")
|
|
111
|
+
except Exception as e:
|
|
112
|
+
logger.warning("cryptography import failed for Signal encryptedKey decrypt: %s", e)
|
|
113
|
+
return None
|
|
114
|
+
|
|
115
|
+
try:
|
|
116
|
+
# Electron/Chromium OSCrypt compatibility (macOS).
|
|
117
|
+
kdf = PBKDF2HMAC(
|
|
118
|
+
algorithm=hashes.SHA1(),
|
|
119
|
+
length=16,
|
|
120
|
+
salt=b"saltysalt",
|
|
121
|
+
iterations=1003,
|
|
122
|
+
backend=default_backend(),
|
|
123
|
+
)
|
|
124
|
+
aes_key = kdf.derive(safe_storage_password.encode("utf-8"))
|
|
125
|
+
cipher = Cipher(algorithms.AES(aes_key), modes.CBC(b" " * 16), backend=default_backend())
|
|
126
|
+
decryptor = cipher.decryptor()
|
|
127
|
+
plaintext = decryptor.update(raw) + decryptor.finalize()
|
|
128
|
+
except Exception as e:
|
|
129
|
+
logger.warning("Signal encryptedKey decryption failed: %s", e)
|
|
130
|
+
return None
|
|
131
|
+
|
|
132
|
+
# PKCS#7 unpadding
|
|
133
|
+
if plaintext:
|
|
134
|
+
pad_len = plaintext[-1]
|
|
135
|
+
if 1 <= pad_len <= 16 and plaintext.endswith(bytes([pad_len]) * pad_len):
|
|
136
|
+
plaintext = plaintext[:-pad_len]
|
|
137
|
+
|
|
138
|
+
# First try direct text forms.
|
|
139
|
+
text = plaintext.decode("utf-8", errors="ignore").strip().strip("\x00")
|
|
140
|
+
normalized = _normalize_hex_key(text)
|
|
141
|
+
if normalized:
|
|
142
|
+
return normalized
|
|
143
|
+
|
|
144
|
+
# Some builds may store binary key material; fall back to hex-encoding bytes.
|
|
145
|
+
binary_hex = plaintext.hex()
|
|
146
|
+
if binary_hex:
|
|
147
|
+
return binary_hex
|
|
148
|
+
return None
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def get_signal_key(config_path: Optional[Path] = None, preferred_hex_key: Optional[str] = None) -> Optional[str]:
    """Resolve the raw SQLCipher key for the Signal database as a hex string.

    Sources are tried in order and the first usable one wins:

    1. ``preferred_hex_key``, if it is a non-blank string.
    2. The ``SIGNAL_KEY_HEX`` or ``SIGNAL_SQLCIPHER_KEY`` environment variable.
    3. The ``key`` entry of Signal's config.json.
    4. The ``encryptedKey`` entry of config.json, decrypted via the Keychain.

    A source that is present but not hex-formatted is skipped with a warning.
    Returns ``None`` when the config file does not exist, no source yields a
    key, or reading the config fails (the failure is logged, not raised).
    """
    explicit_sources = (
        (preferred_hex_key, "Preferred Signal sync key was provided but is not hex-formatted"),
        (
            os.environ.get("SIGNAL_KEY_HEX") or os.environ.get("SIGNAL_SQLCIPHER_KEY"),
            "SIGNAL_KEY_HEX/SIGNAL_SQLCIPHER_KEY is set but is not hex-formatted",
        ),
    )
    for supplied, complaint in explicit_sources:
        if not isinstance(supplied, str) or not supplied.strip():
            continue
        usable = _normalize_hex_key(supplied)
        if usable:
            return usable
        logger.warning(complaint)

    config_path = config_path or get_signal_paths()[0]
    if not config_path.exists():
        return None

    try:
        with config_path.open(encoding="utf-8") as handle:
            settings = json.load(handle)

        plain_key = settings.get("key")
        usable = _normalize_hex_key(plain_key)
        if usable:
            return usable
        if isinstance(plain_key, str):
            logger.warning("Signal config key exists but is not hex-formatted")

        wrapped_key = settings.get("encryptedKey")
        if not isinstance(wrapped_key, str) or not wrapped_key.strip():
            return None
        unwrapped = _decrypt_signal_encrypted_key(wrapped_key.strip())
        if not unwrapped:
            logger.warning("Signal config has encryptedKey but decryption failed")
            return None
        logger.info("Signal encryptedKey decrypted via Keychain")
        return unwrapped
    except Exception as e:
        logger.warning("get_signal_key failed: %s", e)
        return None
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _normalize_signal_ts_seconds(value: Any) -> Optional[float]:
|
|
195
|
+
"""Normalize Signal timestamp values to Unix seconds."""
|
|
196
|
+
if value is None:
|
|
197
|
+
return None
|
|
198
|
+
try:
|
|
199
|
+
ts = float(value)
|
|
200
|
+
except Exception:
|
|
201
|
+
return None
|
|
202
|
+
abs_ts = abs(ts)
|
|
203
|
+
if abs_ts >= 1e17:
|
|
204
|
+
ts = ts / 1_000_000_000.0
|
|
205
|
+
elif abs_ts >= 1e14:
|
|
206
|
+
ts = ts / 1_000_000.0
|
|
207
|
+
elif abs_ts >= 1e11:
|
|
208
|
+
ts = ts / 1_000.0
|
|
209
|
+
return ts
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _normalize_signal_sender_id(value: Any) -> Optional[str]:
|
|
213
|
+
if value is None:
|
|
214
|
+
return None
|
|
215
|
+
text = str(value).strip()
|
|
216
|
+
return text or None
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _safe_json_loads(value: Any) -> Optional[Dict[str, Any]]:
|
|
220
|
+
if value is None:
|
|
221
|
+
return None
|
|
222
|
+
if isinstance(value, dict):
|
|
223
|
+
return value
|
|
224
|
+
if not isinstance(value, str):
|
|
225
|
+
return None
|
|
226
|
+
text = value.strip()
|
|
227
|
+
if not text:
|
|
228
|
+
return None
|
|
229
|
+
try:
|
|
230
|
+
parsed = json.loads(text)
|
|
231
|
+
except Exception:
|
|
232
|
+
return None
|
|
233
|
+
return parsed if isinstance(parsed, dict) else None
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _extract_reply_from_signal_json(payload: Dict[str, Any]) -> tuple[Optional[str], Dict[str, Any]]:
|
|
237
|
+
"""Extract reply linkage from Signal JSON payload shape variants."""
|
|
238
|
+
metadata: Dict[str, Any] = {}
|
|
239
|
+
reply_to: Optional[str] = None
|
|
240
|
+
|
|
241
|
+
for key in ("replyToMessageId", "reply_to_message_id", "quotedMessageId", "quoteId"):
|
|
242
|
+
if payload.get(key) is not None:
|
|
243
|
+
reply_to = str(payload.get(key))
|
|
244
|
+
metadata[key] = payload.get(key)
|
|
245
|
+
break
|
|
246
|
+
|
|
247
|
+
quote = payload.get("quote")
|
|
248
|
+
if isinstance(quote, dict):
|
|
249
|
+
metadata["quote"] = quote
|
|
250
|
+
if reply_to is None:
|
|
251
|
+
for key in ("id", "messageId", "message_id", "targetMessageId"):
|
|
252
|
+
if quote.get(key) is not None:
|
|
253
|
+
reply_to = str(quote.get(key))
|
|
254
|
+
break
|
|
255
|
+
|
|
256
|
+
story_ctx = payload.get("storyReplyContext")
|
|
257
|
+
if isinstance(story_ctx, dict):
|
|
258
|
+
metadata["storyReplyContext"] = story_ctx
|
|
259
|
+
if reply_to is None:
|
|
260
|
+
for key in ("messageId", "message_id", "targetMessageId"):
|
|
261
|
+
if story_ctx.get(key) is not None:
|
|
262
|
+
reply_to = str(story_ctx.get(key))
|
|
263
|
+
break
|
|
264
|
+
|
|
265
|
+
return reply_to, metadata
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def read_signal_rows(
    last_record_id: Optional[str] = None,
    config_path: Optional[Path] = None,
    db_path: Optional[Path] = None,
    my_phone_number: Optional[str] = None,
    batch_size: int = 5000,
    start_unix: Optional[float] = None,
    signal_key_hex: Optional[str] = None,
) -> list[Dict[str, Any]]:
    """Open the Signal SQLCipher DB and return message rows newer than a checkpoint.

    Args:
        last_record_id: Checkpoint of the form ``signal:{id}:{unix_seconds}``,
            i.e. the ``id`` field of a previously returned row. Only messages
            sent after that timestamp are returned. A value that does not
            parse is treated as "no checkpoint".
        config_path: Signal config.json location; defaults to get_signal_paths().
        db_path: Signal database location; defaults to get_signal_paths().
        my_phone_number: Accepted for interface compatibility; not used here.
        batch_size: Maximum number of rows to return.
        start_unix: Optional inclusive lower bound on ``sent_at`` (Unix seconds).
        signal_key_hex: Optional raw SQLCipher key (hex) that takes precedence
            over the environment and config.json.

    Returns:
        A list of dicts ordered by ``sent_at`` with the keys ``id``
        (``signal:{id}:{unix_seconds}``), ``thread_id``, ``content``,
        ``created_at``, ``role`` (``"user"`` for outgoing messages, otherwise
        ``"other"``), ``sender_id``, ``message_type``, ``event_type``,
        ``reply_to_message_id``, ``ROWID`` and ``sent_at``, plus ``_metadata``
        when reply/system/JSON details were found.

    Raises:
        ImportError: pysqlcipher3 is not installed.
        FileNotFoundError: The database file does not exist.
        ValueError: No key could be resolved, the database could not be opened
            with that key, or the ``messages`` table lacks required columns.
    """
    try:
        from pysqlcipher3 import dbapi2 as sqlcipher
    except ImportError as e:
        raise ImportError(
            "pysqlcipher3 required for Signal sync. Install with: pip install pysqlcipher3"
        ) from e

    # Resolve the defaults once, and only when a path was not supplied.
    if not config_path or not db_path:
        default_config_path, default_db_path = get_signal_paths()
        config_path = config_path or default_config_path
        db_path = db_path or default_db_path
    if not db_path.exists():
        raise FileNotFoundError(f"Signal DB not found at {db_path}")
    key = get_signal_key(config_path, preferred_hex_key=signal_key_hex)
    if not key:
        raise ValueError(
            "Signal SQLCipher key unavailable. Could not resolve raw key from config.json "
            "or macOS Keychain. Workaround: set SIGNAL_KEY_HEX to a raw SQLCipher hex key."
        )

    # get_signal_key only ever returns hex digits, so the key can be embedded
    # in the PRAGMA text as a blob literal.
    key_hex_expr = f"x'{key}'"

    conn = None
    last_open_error: Optional[str] = None
    # Different Signal versions/DBs can require different compatibility modes.
    for compat in (4, 3):
        candidate = None
        try:
            candidate = sqlcipher.connect(str(db_path))
            # The key comes first: SQLCipher documents its cipher_* settings as
            # applying to an already-keyed connection, before the first read.
            candidate.execute(f'PRAGMA key = "{key_hex_expr}"')
            candidate.execute(f"PRAGMA cipher_compatibility = {compat}")
            # Force a real page read so a wrong key/mode fails here, not later.
            candidate.execute("SELECT count(*) FROM sqlite_master")
        except Exception as e:
            last_open_error = str(e)
            if candidate is not None:
                try:
                    candidate.close()
                except Exception:
                    pass
            continue
        conn = candidate
        break

    if conn is None:
        raise ValueError(
            f"Unable to open Signal DB at {db_path} with available SQLCipher settings: "
            f"{last_open_error or 'unknown error'}"
        )

    try:
        # Rows come back as {column_name: value} dicts.
        conn.row_factory = lambda c, r: dict(zip([col[0] for col in c.description], r))

        # Signal Desktop schema varies by version; detect available columns first.
        table_info_rows = conn.execute("PRAGMA table_info(messages)").fetchall()
        available_columns = {str(row.get("name") or "") for row in table_info_rows}

        if not {"id", "body", "sent_at"} <= available_columns:
            raise ValueError(
                "Signal messages table is missing required columns (id/body/sent_at). "
                f"Available columns: {sorted(c for c in available_columns if c)}"
            )

        if "conversationId" in available_columns:
            conversation_col = "conversationId"
        elif "conversation_id" in available_columns:
            conversation_col = "conversation_id"
        else:
            raise ValueError(
                "Signal messages table is missing conversation column (conversationId/conversation_id). "
                f"Available columns: {sorted(c for c in available_columns if c)}"
            )

        def present(names):
            # Keep the candidate order, restricted to columns this schema has.
            return [c for c in names if c in available_columns]

        def select_suffix(cols):
            # ", a, b" fragment to append to the SELECT list ("" when empty).
            return ", " + ", ".join(cols) if cols else ""

        # Only names from these fixed lists are ever interpolated into the SQL.
        sender_cols = present(("sourceServiceId", "sourceUuid", "source"))
        reply_cols = present(
            (
                "quoteId",
                "quotedMessageId",
                "replyToMessageId",
                "reply_to_message_id",
                "quoteAuthorAci",
                "quoteAuthorUuid",
                "quoteAuthor",
                "quoteText",
                "quoteBody",
                "storyReplyContext",
            )
        )
        system_cols = present(
            (
                "groupV2Change",
                "groupUpdate",
                "groupChange",
                "callId",
                "callHistoryDetails",
                "expiresTimer",
                "expirationStartTimestamp",
                "isErased",
                "isViewOnce",
                "isStory",
            )
        )
        json_cols = present(("json", "messageJson", "payload_json"))
        sender_select = select_suffix(sender_cols)
        reply_select = select_suffix(reply_cols)
        system_select = select_suffix(system_cols)
        json_select = select_suffix(json_cols)

        last_ts: float = 0.0
        if last_record_id:
            parts = last_record_id.split(":")
            if len(parts) >= 3 and parts[0] == "signal":
                try:
                    # The timestamp is always the final field of the id.
                    last_ts = float(parts[-1])
                except ValueError:
                    pass
        # Query-side normalization converts sent_at to milliseconds.
        # NOTE(review): the checkpoint stores whole seconds, so messages sharing
        # that final second can be returned again on the next call; callers
        # presumably de-duplicate on "id" - confirm.
        last_ts_ms = float(last_ts) * 1000.0

        start_ms: Optional[int] = None
        if start_unix is not None:
            try:
                start_ms = int(float(start_unix) * 1000.0)
            except (TypeError, ValueError, OverflowError):
                start_ms = None

        # Signal Desktop: read only columns that exist in this schema variant.
        # Same magnitude thresholds as _normalize_signal_ts_seconds, but the
        # result is expressed in milliseconds.
        normalized_sent_at_expr = """
            CASE
                WHEN abs(sent_at) >= 100000000000000000 THEN (sent_at / 1000000.0)
                WHEN abs(sent_at) >= 100000000000000 THEN (sent_at / 1000.0)
                WHEN abs(sent_at) >= 100000000000 THEN (sent_at * 1.0)
                ELSE (sent_at * 1000.0)
            END
        """

        query = f"""
            SELECT id, body, sent_at, type, {conversation_col} AS conversation_id{sender_select}{reply_select}{system_select}{json_select}
            FROM messages
            WHERE ({normalized_sent_at_expr}) > ?
            AND (
                ? IS NULL
                OR ({normalized_sent_at_expr}) >= ?
            )
            ORDER BY sent_at
            LIMIT ?
        """
        cursor = conn.execute(query, (last_ts_ms, start_ms, start_ms, batch_size))
        rows = cursor.fetchall()
        out = []
        for r in rows:
            msg_id = r.get("id") or r.get("rowid")
            sent_at_seconds = _normalize_signal_ts_seconds(r.get("sent_at"))
            sent_at = sent_at_seconds if sent_at_seconds is not None else 0
            # str() guards against a non-text "type" value.
            msg_type = str(r.get("type") or "").lower()
            role = "user" if msg_type == "outgoing" else "other"
            message_type = "system" if msg_type not in {"outgoing", "incoming"} else "message"
            event_type = f"signal_type:{msg_type}" if message_type == "system" and msg_type else None
            sender_id = _normalize_signal_sender_id(next((r.get(c) for c in sender_cols if r.get(c)), None))
            if role == "user":
                sender_id = "self"
            if not sender_id:
                sender_id = f"unknown:{msg_id}"
            reply_to_message_id = next(
                (
                    str(r.get(c))
                    for c in ("quoteId", "quotedMessageId", "replyToMessageId", "reply_to_message_id")
                    if c in reply_cols and r.get(c)
                ),
                None,
            )
            content = (r.get("body") or "").strip()
            if not content and message_type == "system":
                content = f"[system_event:{msg_type or 'signal'}]"

            metadata: Dict[str, Any] = {}
            for c in reply_cols:
                if r.get(c) is not None:
                    metadata[c] = r.get(c)
            for c in system_cols:
                if r.get(c) is not None:
                    metadata[c] = r.get(c)

            # Many Signal Desktop builds keep reply context in JSON payload instead of dedicated columns.
            json_payload = None
            for c in json_cols:
                parsed = _safe_json_loads(r.get(c))
                if parsed:
                    json_payload = parsed
                    break
            if json_payload:
                json_reply_to, json_reply_meta = _extract_reply_from_signal_json(json_payload)
                # A dedicated column wins over the JSON payload.
                if reply_to_message_id is None and json_reply_to is not None:
                    reply_to_message_id = json_reply_to
                metadata.update(json_reply_meta)
            row_out = {
                "id": f"signal:{msg_id}:{int(sent_at)}",
                "thread_id": str(r.get("conversation_id") or ""),
                "content": content,
                "created_at": sent_at,
                "role": role,
                "sender_id": sender_id,
                "message_type": message_type,
                "event_type": event_type,
                "reply_to_message_id": reply_to_message_id,
                "ROWID": msg_id,
                "sent_at": sent_at,
            }
            if metadata:
                row_out["_metadata"] = metadata
            out.append(row_out)
        return out
    finally:
        conn.close()
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Dict, Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class JobState(str, Enum):
    """Lifecycle states of an ingestion job.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    # Not yet started / started.
    QUEUED = "queued"
    RUNNING = "running"
    # Pipeline stages, in processing order.
    PARSING = "parsing"
    RAW_ENRICH = "raw_enrich"
    CANONICALIZE = "canonicalize"
    CANONICAL_ENRICH = "canonical_enrich"
    VECTOR_INDEX = "vector_index"
    # Outcomes and recovery.
    COMPLETE = "complete"
    FAILED = "failed"
    RETRYING = "retrying"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class JobEvent(str, Enum):
    """Events that drive an ingestion job from one state to the next.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    START = "start"
    # Stage notifications, in processing order.
    PARSING_STARTED = "parsing_started"
    PARSING_COMPLETED = "parsing_completed"
    RAW_ENRICHED = "raw_enriched"
    CANONICALIZED = "canonicalized"
    CANONICAL_ENRICHED = "canonical_enriched"
    VECTOR_INDEXED = "vector_indexed"
    # Failure handling.
    FAIL = "fail"
    RETRY = "retry"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True)
class IngestionJob:
    """Frozen record describing a single ingestion job.

    ``frozen=True`` blocks attribute reassignment on an instance; the
    ``metadata`` dict object itself can still be mutated in place.
    """

    # Required identifiers.
    job_id: str
    dataset_id: str
    schema_id: str

    # Free-form string metadata; each instance gets its own empty dict.
    metadata: Dict[str, str] = field(default_factory=dict)

    # New jobs begin in the QUEUED state.
    state: JobState = JobState.QUEUED

    # No checkpoint is recorded unless one is supplied.
    checkpoint_id: Optional[str] = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class IngestionStateMachine:
    """Interface for computing the next state of an ingestion job."""

    def transition(self, job: IngestionJob, event: JobEvent) -> JobState:
        """Return the state ``job`` moves to when ``event`` is applied.

        This base implementation is a placeholder that subclasses override.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class DefaultStateMachine(IngestionStateMachine):
    """Table-driven implementation of the ingestion state contract.

    Resolution order in :meth:`transition`:

    1. ``FAIL`` maps to ``FAILED`` regardless of the current state.
    2. ``RETRY`` maps to ``RETRYING`` when the job is currently ``FAILED``.
    3. Anything else is looked up in ``_transitions``.
    """

    # Current state -> {event -> next state}.
    _transitions: Dict[JobState, Dict[JobEvent, JobState]] = {
        JobState.QUEUED: {
            JobEvent.START: JobState.RUNNING,
        },
        JobState.RETRYING: {
            JobEvent.START: JobState.RUNNING,
        },
        JobState.RUNNING: {
            JobEvent.PARSING_STARTED: JobState.PARSING,
        },
        JobState.PARSING: {
            JobEvent.PARSING_COMPLETED: JobState.RAW_ENRICH,
        },
        JobState.RAW_ENRICH: {
            JobEvent.RAW_ENRICHED: JobState.CANONICALIZE,
        },
        JobState.CANONICALIZE: {
            JobEvent.CANONICALIZED: JobState.CANONICAL_ENRICH,
        },
        JobState.CANONICAL_ENRICH: {
            JobEvent.CANONICAL_ENRICHED: JobState.VECTOR_INDEX,
        },
        JobState.VECTOR_INDEX: {
            JobEvent.VECTOR_INDEXED: JobState.COMPLETE,
        },
    }

    def transition(self, job: IngestionJob, event: JobEvent) -> JobState:
        """Return the state reached by applying ``event`` to ``job``.

        Args:
            job: The job whose ``state`` is the starting point.
            event: The event being applied.

        Returns:
            The resulting ``JobState``.

        Raises:
            ValueError: If the table has no entry for this state/event pair
                and neither the FAIL nor the RETRY rule applies.
        """
        # Equality (not identity) is used so plain strings equal to the
        # enum values are treated the same as the members themselves.
        if event == JobEvent.FAIL:
            return JobState.FAILED

        current = job.state
        if event == JobEvent.RETRY and current == JobState.FAILED:
            return JobState.RETRYING

        allowed = self._transitions.get(current)
        target = allowed.get(event) if allowed else None
        if target:
            return target
        raise ValueError(f"Invalid transition: {current} + {event}")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Ingestion triggers."""
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from ..state_machine import IngestionJob
|
|
7
|
+
from ...storage.raw.file_store import RawFileStore
|
|
8
|
+
from ...storage.raw.raw_store import RawFile
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class FileTrigger:
    """Builds ingestion jobs after handing the raw input to a file store."""

    # Store that receives the raw file or raw bytes before a job is returned.
    file_store: RawFileStore

    @staticmethod
    def _queued_job(job_id: str, dataset_id: str, schema_id: str, file_format: str) -> IngestionJob:
        """Build the job record shared by both creation paths."""
        return IngestionJob(
            job_id=job_id,
            dataset_id=dataset_id,
            schema_id=schema_id,
            metadata={"file_format": file_format},
        )

    def create_job(
        self,
        job_id: str,
        dataset_id: str,
        schema_id: str,
        file_path: str,
        file_format: str = "jsonl",
    ) -> IngestionJob:
        """Register a file by path with the store and return a job for it.

        A ``RawFile`` carrying ``file_path`` plus dataset/schema metadata is
        passed to ``file_store.write_file``; the returned job records
        ``file_format`` in its metadata.
        """
        source_tags = {"dataset_id": dataset_id, "schema_id": schema_id}
        descriptor = RawFile(file_path=file_path, metadata=source_tags)
        self.file_store.write_file(descriptor)
        return self._queued_job(job_id, dataset_id, schema_id, file_format)

    def create_job_from_bytes(
        self,
        job_id: str,
        dataset_id: str,
        schema_id: str,
        payload: bytes,
        file_format: str = "jsonl",
    ) -> IngestionJob:
        """Hand an in-memory payload to the store and return a job for it.

        ``payload`` is passed to ``file_store.write_bytes`` together with the
        dataset and schema ids; the result of that call is not used.
        """
        self.file_store.write_bytes(dataset_id, schema_id, payload)
        return self._queued_job(job_id, dataset_id, schema_id, file_format)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Iterable, List
|
|
5
|
+
|
|
6
|
+
from ..state_machine import IngestionJob
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class SQLiteTrigger:
    """Stub trigger for raw table writes."""

    def poll(self) -> Iterable[IngestionJob]:
        """Return the jobs found by polling; this stub always yields none."""
        no_jobs: List[IngestionJob] = []
        return no_jobs

    def enqueue_from_records(self, records: List[dict]) -> List[IngestionJob]:
        """Accept ``records`` and return the jobs created; this stub creates none."""
        # The argument is intentionally unused in the stub.
        del records
        created: List[IngestionJob] = []
        return created
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Validation utilities for ingestion."""
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Validation primitives for ingestion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, Dict, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True)
class ValidationResult:
    """Frozen record holding the outcome of a validation."""

    # Overall verdict.
    is_valid: bool

    # Error messages as strings.
    errors: list[str]

    # Arbitrary additional data attached to the result.
    metadata: Dict[str, Any]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class SchemaDefinition:
    """Frozen record pairing a schema identifier and version with its raw body."""

    # Identifier and version of the schema.
    schema_id: str
    version: str

    # The schema content as a dictionary.
    raw_schema: Dict[str, Any]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SchemaValidator:
    """Validates raw records against a schema definition."""

    def validate(self, record: Dict[str, Any], schema: Optional[SchemaDefinition] = None) -> ValidationResult:
        """Check ``record`` against ``schema`` and report the outcome.

        This base implementation is a placeholder that subclasses override.

        Args:
            record: The raw record to validate.
            schema: Optional schema definition; defaults to ``None``.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()
|