topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,479 @@
|
|
|
1
|
+
"""iMessage reader: copy chat.db to temp (or open read-only), query messages since checkpoint.
|
|
2
|
+
|
|
3
|
+
Requires macOS and Full Disk Access for ~/Library/Messages/chat.db.
|
|
4
|
+
Uses chunked copy to support chat.db larger than ~2GB (avoids errno 84 EOVERFLOW from sendfile).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import errno
|
|
10
|
+
import logging
|
|
11
|
+
import os
|
|
12
|
+
import plistlib
|
|
13
|
+
import re
|
|
14
|
+
import sqlite3
|
|
15
|
+
import tempfile
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Dict, Iterator, Optional
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger("topos.ingestion.sources.imessage_reader")
|
|
20
|
+
|
|
21
|
+
# Mac epoch offset: seconds from the Unix epoch (1970-01-01 UTC) to Apple's reference date (2001-01-01 UTC)
|
|
22
|
+
MAC_EPOCH_OFFSET = 978307200
|
|
23
|
+
|
|
24
|
+
DEFAULT_CHAT_DB_PATH = Path.home() / "Library" / "Messages" / "chat.db"
|
|
25
|
+
|
|
26
|
+
# Chunk size for copy (avoids sendfile/stat overflow on files > ~2GB)
|
|
27
|
+
COPY_CHUNK_SIZE = 8 * 1024 * 1024 # 8 MiB
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_chat_db_path() -> Path:
    """Return the path to the iMessage chat.db (macOS).

    The ``IMESSAGE_CHAT_DB`` environment variable overrides the default
    location. A leading ``~`` in the override is expanded to the home
    directory. An empty or whitespace-only override is treated as unset, so
    it can no longer resolve to the current working directory.
    """
    override = os.environ.get("IMESSAGE_CHAT_DB", "").strip()
    if override:
        return Path(override).expanduser()
    return Path(DEFAULT_CHAT_DB_PATH)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def mac_epoch_to_unix(mac_date: Optional[int]) -> Optional[float]:
    """Translate an iMessage ``message.date`` value into a Unix timestamp.

    The stored value counts time since 2001-01-01 but may be expressed in
    seconds, milliseconds, microseconds, or nanoseconds depending on the
    OS/version/export path. The unit is guessed from the magnitude, the value
    is scaled down to seconds, and MAC_EPOCH_OFFSET is added.

    Returns None when ``mac_date`` is None.
    """
    if mac_date is None:
        return None

    seconds = float(mac_date)
    magnitude = abs(seconds)
    # Thresholds sit between the typical magnitudes of each unit:
    # seconds ~1e9, milliseconds ~1e12, microseconds ~1e15, nanoseconds ~1e18.
    for threshold, divisor in (
        (1e17, 1_000_000_000.0),
        (1e14, 1_000_000.0),
        (1e11, 1_000.0),
    ):
        if magnitude >= threshold:
            seconds = seconds / divisor
            break
    return seconds + MAC_EPOCH_OFFSET
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _copy_large_file(src: Path, dst: str, show_progress: bool = True) -> None:
|
|
61
|
+
"""Copy file in chunks using os.open/os.read/os.write only, to avoid EOVERFLOW (errno 84) on any system.
|
|
62
|
+
Optional progress bar when size is available (stat may raise 84 on large files; we catch and skip bar).
|
|
63
|
+
"""
|
|
64
|
+
total_size: Optional[int] = None
|
|
65
|
+
if show_progress:
|
|
66
|
+
try:
|
|
67
|
+
total_size = src.stat().st_size
|
|
68
|
+
except OSError as e:
|
|
69
|
+
if getattr(e, "errno", None) == errno.EOVERFLOW:
|
|
70
|
+
logger.debug("chat.db size overflow (EOVERFLOW), copying without progress bar")
|
|
71
|
+
total_size = None
|
|
72
|
+
|
|
73
|
+
pbar = None
|
|
74
|
+
if show_progress and total_size is not None and total_size > 0:
|
|
75
|
+
from topos.enrichment.progress_bar import ProgressBar
|
|
76
|
+
pbar = ProgressBar(total=total_size, desc="Copying chat.db", width=40)
|
|
77
|
+
pbar.__enter__()
|
|
78
|
+
|
|
79
|
+
fd_in = fd_out = None
|
|
80
|
+
try:
|
|
81
|
+
try:
|
|
82
|
+
fd_in = os.open(str(src), os.O_RDONLY)
|
|
83
|
+
except OSError as e:
|
|
84
|
+
if getattr(e, "errno", None) == errno.EOVERFLOW:
|
|
85
|
+
logger.warning(
|
|
86
|
+
"EOVERFLOW opening source chat.db (file may be too large for this system): path=%s",
|
|
87
|
+
src,
|
|
88
|
+
)
|
|
89
|
+
raise
|
|
90
|
+
fd_out = os.open(dst, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
|
|
91
|
+
while True:
|
|
92
|
+
chunk = os.read(fd_in, COPY_CHUNK_SIZE)
|
|
93
|
+
if not chunk:
|
|
94
|
+
break
|
|
95
|
+
os.write(fd_out, chunk)
|
|
96
|
+
if pbar is not None:
|
|
97
|
+
pbar.update(len(chunk))
|
|
98
|
+
finally:
|
|
99
|
+
if fd_in is not None:
|
|
100
|
+
try:
|
|
101
|
+
os.close(fd_in)
|
|
102
|
+
except OSError:
|
|
103
|
+
pass
|
|
104
|
+
if fd_out is not None:
|
|
105
|
+
try:
|
|
106
|
+
os.close(fd_out)
|
|
107
|
+
except OSError:
|
|
108
|
+
pass
|
|
109
|
+
if pbar is not None:
|
|
110
|
+
pbar.close()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _normalize_sender_id(value: Any) -> Optional[str]:
|
|
114
|
+
"""Normalize sender identity from handle.id for storage."""
|
|
115
|
+
if value is None:
|
|
116
|
+
return None
|
|
117
|
+
text = str(value).strip()
|
|
118
|
+
return text or None
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _extract_text_from_plist(obj: Any) -> list[str]:
|
|
122
|
+
"""Recursively pull likely text values from parsed plist structures."""
|
|
123
|
+
out: list[str] = []
|
|
124
|
+
if isinstance(obj, str):
|
|
125
|
+
s = " ".join(obj.split()).strip()
|
|
126
|
+
if s and any(ch.isalpha() for ch in s):
|
|
127
|
+
out.append(s)
|
|
128
|
+
elif isinstance(obj, dict):
|
|
129
|
+
for k, v in obj.items():
|
|
130
|
+
# Skip obviously structural keys.
|
|
131
|
+
if isinstance(k, str) and k in {
|
|
132
|
+
"$archiver", "$version", "$objects", "$top", "$class",
|
|
133
|
+
"NS.keys", "NS.objects",
|
|
134
|
+
}:
|
|
135
|
+
continue
|
|
136
|
+
out.extend(_extract_text_from_plist(v))
|
|
137
|
+
elif isinstance(obj, (list, tuple, set)):
|
|
138
|
+
for item in obj:
|
|
139
|
+
out.extend(_extract_text_from_plist(item))
|
|
140
|
+
return out
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _looks_like_archive_noise(s: str) -> bool:
|
|
144
|
+
low = s.lower()
|
|
145
|
+
return (
|
|
146
|
+
low.startswith("ns.")
|
|
147
|
+
or "nskeyedarchiver" in low
|
|
148
|
+
or "nsdictionary" in low
|
|
149
|
+
or "nsmutablestring" in low
|
|
150
|
+
or "nsnumber" in low
|
|
151
|
+
or "attribute" in low
|
|
152
|
+
or low in {"bplist00", "$objects", "$top", "$version", "$archiver"}
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _extract_utf8_text_candidates(raw: bytes) -> list[str]:
|
|
157
|
+
"""Extract likely human text from UTF-8 byte payloads only.
|
|
158
|
+
|
|
159
|
+
We intentionally avoid utf-16/latin blind decoding to prevent fake CJK
|
|
160
|
+
gibberish from archive bytes interpreted with wrong encodings.
|
|
161
|
+
"""
|
|
162
|
+
try:
|
|
163
|
+
decoded = raw.decode("utf-8", errors="ignore")
|
|
164
|
+
except Exception:
|
|
165
|
+
return []
|
|
166
|
+
candidates: list[str] = []
|
|
167
|
+
for match in re.findall(r"[^\x00-\x1F]{4,}", decoded):
|
|
168
|
+
s = " ".join(match.split()).strip()
|
|
169
|
+
if not s:
|
|
170
|
+
continue
|
|
171
|
+
if _looks_like_archive_noise(s):
|
|
172
|
+
continue
|
|
173
|
+
# Keep likely natural-language strings; avoid purely symbolic fragments.
|
|
174
|
+
alpha = sum(1 for ch in s if ch.isalpha())
|
|
175
|
+
if alpha < 3:
|
|
176
|
+
continue
|
|
177
|
+
candidates.append(s)
|
|
178
|
+
return candidates
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _extract_text_from_attributed_body(value: Any) -> Optional[str]:
|
|
182
|
+
"""Best-effort extraction of human text from iMessage attributedBody blobs.
|
|
183
|
+
|
|
184
|
+
Important: do NOT decode arbitrary bytes as plain text. That produces
|
|
185
|
+
garbage strings (often CJK-looking) when archive bytes are interpreted with
|
|
186
|
+
the wrong encoding.
|
|
187
|
+
"""
|
|
188
|
+
if value is None:
|
|
189
|
+
return None
|
|
190
|
+
if isinstance(value, str):
|
|
191
|
+
s = " ".join(value.split()).strip()
|
|
192
|
+
return s or None
|
|
193
|
+
if not isinstance(value, (bytes, bytearray, memoryview)):
|
|
194
|
+
return None
|
|
195
|
+
raw = bytes(value)
|
|
196
|
+
candidates: list[str] = []
|
|
197
|
+
|
|
198
|
+
# Many modern iMessage attributedBody fields are keyed archive plists.
|
|
199
|
+
if raw.startswith(b"bplist00"):
|
|
200
|
+
try:
|
|
201
|
+
plist_obj = plistlib.loads(raw)
|
|
202
|
+
candidates.extend(_extract_text_from_plist(plist_obj))
|
|
203
|
+
except Exception:
|
|
204
|
+
pass
|
|
205
|
+
|
|
206
|
+
# Some attributedBody payloads are not bplist but still contain UTF-8 text.
|
|
207
|
+
if not candidates:
|
|
208
|
+
candidates.extend(_extract_utf8_text_candidates(raw))
|
|
209
|
+
if not candidates:
|
|
210
|
+
return None
|
|
211
|
+
|
|
212
|
+
# Filter out noisy/internal archive strings and pick best candidate.
|
|
213
|
+
filtered = [s for s in candidates if not _looks_like_archive_noise(s)]
|
|
214
|
+
if not filtered:
|
|
215
|
+
return None
|
|
216
|
+
best = max(filtered, key=len)
|
|
217
|
+
best = best.replace("\ufffc", "").strip()
|
|
218
|
+
return best or None
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _build_content_from_row(row: Dict[str, Any]) -> Optional[str]:
|
|
222
|
+
"""Build content string for iMessage rows, including non-text message forms."""
|
|
223
|
+
text = (row.get("text") or "").strip()
|
|
224
|
+
if text:
|
|
225
|
+
return text
|
|
226
|
+
|
|
227
|
+
attributed_text = _extract_text_from_attributed_body(row.get("attributed_body"))
|
|
228
|
+
if attributed_text:
|
|
229
|
+
return attributed_text
|
|
230
|
+
|
|
231
|
+
subject = (row.get("subject") or "").strip()
|
|
232
|
+
if subject:
|
|
233
|
+
return subject
|
|
234
|
+
|
|
235
|
+
# Handle tapbacks / reaction-style records where text is empty.
|
|
236
|
+
associated_guid = row.get("associated_message_guid")
|
|
237
|
+
associated_type = row.get("associated_message_type")
|
|
238
|
+
if associated_guid:
|
|
239
|
+
return f"[reaction:{associated_type}]"
|
|
240
|
+
|
|
241
|
+
if row.get("cache_has_attachments"):
|
|
242
|
+
return "[attachment]"
|
|
243
|
+
|
|
244
|
+
item_type = row.get("item_type")
|
|
245
|
+
if item_type not in (None, 0, "0"):
|
|
246
|
+
return f"[system_event:item_type={item_type}]"
|
|
247
|
+
|
|
248
|
+
return None
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _extract_imessage_context(row: Dict[str, Any]) -> Dict[str, Any]:
|
|
252
|
+
"""Extract unified reply/system context for canonical mapping."""
|
|
253
|
+
metadata: Dict[str, Any] = {}
|
|
254
|
+
reply_to_message_id: Optional[str] = None
|
|
255
|
+
message_type = "message"
|
|
256
|
+
event_type: Optional[str] = None
|
|
257
|
+
|
|
258
|
+
thread_originator_guid = row.get("thread_originator_guid")
|
|
259
|
+
if thread_originator_guid:
|
|
260
|
+
reply_to_message_id = str(thread_originator_guid)
|
|
261
|
+
metadata["thread_originator_guid"] = str(thread_originator_guid)
|
|
262
|
+
if row.get("thread_originator_part") is not None:
|
|
263
|
+
metadata["thread_originator_part"] = row.get("thread_originator_part")
|
|
264
|
+
|
|
265
|
+
associated_guid = row.get("associated_message_guid")
|
|
266
|
+
associated_type = row.get("associated_message_type")
|
|
267
|
+
if associated_guid:
|
|
268
|
+
metadata["associated_message_guid"] = str(associated_guid)
|
|
269
|
+
if associated_type is not None:
|
|
270
|
+
metadata["associated_message_type"] = associated_type
|
|
271
|
+
|
|
272
|
+
item_type = row.get("item_type")
|
|
273
|
+
if item_type not in (None, 0, "0"):
|
|
274
|
+
message_type = "system"
|
|
275
|
+
event_type = f"imessage_item_type:{item_type}"
|
|
276
|
+
metadata["item_type"] = item_type
|
|
277
|
+
|
|
278
|
+
group_action_type = row.get("group_action_type")
|
|
279
|
+
if group_action_type not in (None, 0, "0"):
|
|
280
|
+
message_type = "system"
|
|
281
|
+
event_type = f"imessage_group_action:{group_action_type}"
|
|
282
|
+
metadata["group_action_type"] = group_action_type
|
|
283
|
+
|
|
284
|
+
if row.get("message_guid"):
|
|
285
|
+
metadata["message_guid"] = str(row.get("message_guid"))
|
|
286
|
+
if row.get("chat_guid"):
|
|
287
|
+
metadata["chat_guid"] = str(row.get("chat_guid"))
|
|
288
|
+
if row.get("chat_identifier"):
|
|
289
|
+
metadata["chat_identifier"] = str(row.get("chat_identifier"))
|
|
290
|
+
|
|
291
|
+
result: Dict[str, Any] = {
|
|
292
|
+
"message_type": message_type,
|
|
293
|
+
"event_type": event_type,
|
|
294
|
+
}
|
|
295
|
+
if reply_to_message_id:
|
|
296
|
+
result["reply_to_message_id"] = reply_to_message_id
|
|
297
|
+
if metadata:
|
|
298
|
+
result["_metadata"] = metadata
|
|
299
|
+
return result
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def read_imessage_rows(
    last_rowid: Optional[str] = None,
    chat_db_path: Optional[Path] = None,
    batch_size: int = 5000,
    start_unix: Optional[float] = None,
) -> Iterator[Dict[str, Any]]:
    """Yield one batch of iMessage rows read from a temporary copy of chat.db.

    chat.db is copied in chunks (to support files larger than ~2GB) to a temp
    file, which is queried for messages with ROWID > ``last_rowid`` in ROWID
    order, limited to ``batch_size`` rows. Rows with no derivable content are
    skipped. The temp copy is removed when the generator finishes, is closed,
    or fails at any point, including when the copy itself fails.

    Args:
        last_rowid: Checkpoint such as "imessage:12345" or "12345". An
            unparseable value is ignored and reading starts from ROWID 0.
        chat_db_path: Database to read; defaults to get_chat_db_path().
        batch_size: Maximum number of rows fetched by the query.
        start_unix: If given, only messages dated at or after this Unix
            timestamp are returned.

    Yields:
        Dicts with ``id`` ("imessage:ROWID"), ``thread_id`` (chat ROWID as a
        string), ``content``, ``created_at`` (Unix timestamp or None),
        ``role`` ("user" when is_from_me, else "other"), ``sender_id`` and
        ``ROWID``, plus ``reply_to_message_id``, ``message_type``,
        ``event_type`` and ``_metadata`` when available.

    Raises:
        FileNotFoundError: If the database file does not exist.
        PermissionError: If the database cannot be copied.
    """
    path = Path(chat_db_path) if chat_db_path else get_chat_db_path()
    if not path.exists():
        raise FileNotFoundError(f"chat.db not found at {path}; Full Disk Access may be required")

    last = 0
    if last_rowid:
        # last_record_id may be "imessage:12345" or "12345"
        raw = str(last_rowid).split(":")[-1]
        try:
            last = int(raw)
        except ValueError:
            logger.warning(
                "Ignoring unparseable iMessage checkpoint %r; reading from the start",
                last_rowid,
            )

    mac_start_seconds = None
    if start_unix is not None:
        mac_start_seconds = float(start_unix) - MAC_EPOCH_OFFSET

    # NOTE(review): only the main database file is copied. If chat.db runs in
    # WAL mode, messages still sitting in chat.db-wal may be missing - confirm.
    fd, copy_path = tempfile.mkstemp(suffix=".db", prefix="topos_imessage_")
    try:
        os.close(fd)
        try:
            _copy_large_file(path, copy_path)
        except OSError as e:
            if getattr(e, "errno", None) != errno.EOVERFLOW:
                raise PermissionError(f"Cannot copy chat.db: {e}. Full Disk Access may be required.") from e
            # Retry without the progress bar, whose stat() call can itself overflow.
            try:
                _copy_large_file(path, copy_path, show_progress=False)
            except OSError as retry_e:
                raise PermissionError(
                    f"Cannot copy chat.db: {retry_e}. Full Disk Access may be required."
                ) from retry_e

        try:
            conn = sqlite3.connect(copy_path)
        except OSError as e:
            if getattr(e, "errno", None) == errno.EOVERFLOW:
                logger.warning(
                    "EOVERFLOW opening copied chat.db with SQLite (copied file may be too large): %s",
                    copy_path,
                )
            raise
        conn.row_factory = sqlite3.Row
        try:
            def _table_columns(table: str) -> set:
                return {
                    str(r["name"])
                    for r in conn.execute(f"PRAGMA table_info({table})").fetchall()
                    if r["name"]
                }

            message_columns = _table_columns("message")
            chat_columns = _table_columns("chat")

            # Columns that may be absent from this schema are selected as NULL.
            def _message_col_or_null(column: str, alias: str) -> str:
                if column in message_columns:
                    return f"message.{column} AS {alias}"
                return f"NULL AS {alias}"

            def _chat_col_or_null(column: str, alias: str) -> str:
                if column in chat_columns:
                    return f"chat.{column} AS {alias}"
                return f"NULL AS {alias}"

            # Include non-text message forms too; content is synthesized when text is absent.
            query = f"""
                SELECT message.ROWID AS rowid,
                       message.text AS text,
                       message.subject AS subject,
                       message.attributedBody AS attributed_body,
                       message.associated_message_guid AS associated_message_guid,
                       message.associated_message_type AS associated_message_type,
                       message.cache_has_attachments AS cache_has_attachments,
                       message.item_type AS item_type,
                       {_message_col_or_null("group_action_type", "group_action_type")},
                       {_message_col_or_null("thread_originator_guid", "thread_originator_guid")},
                       {_message_col_or_null("thread_originator_part", "thread_originator_part")},
                       {_message_col_or_null("guid", "message_guid")},
                       message.date AS date,
                       message.handle_id AS handle_id,
                       message.is_from_me AS is_from_me,
                       handle.id AS sender_id,
                       chat.ROWID AS chat_id,
                       {_chat_col_or_null("guid", "chat_guid")},
                       {_chat_col_or_null("chat_identifier", "chat_identifier")}
                FROM message
                JOIN chat_message_join ON message.ROWID = chat_message_join.message_id
                JOIN chat ON chat.ROWID = chat_message_join.chat_id
                LEFT JOIN handle ON handle.ROWID = message.handle_id
                WHERE message.ROWID > ?
                AND (
                    ? IS NULL
                    OR (
                        CASE
                            WHEN abs(message.date) >= 100000000000000000 THEN (message.date / 1000000000.0)
                            WHEN abs(message.date) >= 100000000000000 THEN (message.date / 1000000.0)
                            WHEN abs(message.date) >= 100000000000 THEN (message.date / 1000.0)
                            ELSE (message.date * 1.0)
                        END
                    ) >= ?
                )
                ORDER BY message.ROWID
                LIMIT ?
            """
            cursor = conn.execute(query, (last, mac_start_seconds, mac_start_seconds, batch_size))
            for row in cursor:
                r = dict(row)
                rowid = r["rowid"]
                content = _build_content_from_row(r)
                if not content:
                    continue
                mac_date = r.get("date")
                unix_ts = mac_epoch_to_unix(mac_date) if mac_date is not None else None
                is_from_me = r.get("is_from_me", 0)
                role = "user" if is_from_me else "other"
                context = _extract_imessage_context(r)
                if is_from_me:
                    sender_id = "self"
                else:
                    sender_id = _normalize_sender_id(r.get("sender_id")) or f"unknown:{r.get('handle_id')}"
                out = {
                    "id": f"imessage:{rowid}",
                    "thread_id": str(r.get("chat_id", "")),
                    "content": content,
                    "created_at": unix_ts,
                    "role": role,
                    "sender_id": sender_id,
                    "ROWID": rowid,
                }
                # Copy over only the context fields that carry a value.
                for key in ("reply_to_message_id", "message_type", "event_type", "_metadata"):
                    if context.get(key):
                        out[key] = context[key]
                yield out
        finally:
            conn.close()
    finally:
        # Always remove the private copy, even when the copy step itself failed.
        # SQLite may also leave -wal/-shm sidecar files next to it.
        for suffix in ("", "-wal", "-shm"):
            try:
                os.unlink(copy_path + suffix)
            except OSError:
                pass
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def read_imessage_rows_list(
    last_rowid: Optional[str] = None,
    chat_db_path: Optional[Path] = None,
    batch_size: int = 5000,
    start_unix: Optional[float] = None,
) -> list[Dict[str, Any]]:
    """Eagerly collect everything read_imessage_rows yields for the same arguments."""
    rows = read_imessage_rows(
        last_rowid=last_rowid,
        chat_db_path=chat_db_path,
        batch_size=batch_size,
        start_unix=start_unix,
    )
    return [*rows]
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Parse Signal export files (JSON) into normalized records for conversation_messages.
|
|
2
|
+
|
|
3
|
+
Supported format: JSON array of message objects. Each object may have:
|
|
4
|
+
- conversationId or conversation_id
|
|
5
|
+
- body or content
|
|
6
|
+
- sent_at (ms or sec) or created_at
|
|
7
|
+
- type: "outgoing" | "incoming" (for from_self)
|
|
8
|
+
- source or sender (phone number for identity matching)
|
|
9
|
+
|
|
10
|
+
message_id is stable: signal_import:{conversation_id}:{sent_at}:{content_hash} for idempotent re-upload.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import hashlib
|
|
16
|
+
import json
|
|
17
|
+
import logging
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from typing import Any, Dict, List, Optional
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger("topos.ingestion.sources.signal_export_parser")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _norm_ts(sent_at: Any) -> str:
|
|
25
|
+
"""Normalize sent_at (ms or sec) to ISO ts string."""
|
|
26
|
+
if sent_at is None:
|
|
27
|
+
return datetime.now(timezone.utc).isoformat()
|
|
28
|
+
if isinstance(sent_at, str):
|
|
29
|
+
return sent_at
|
|
30
|
+
if isinstance(sent_at, (int, float)):
|
|
31
|
+
if sent_at > 1e12: # milliseconds
|
|
32
|
+
sent_at = sent_at / 1000.0
|
|
33
|
+
return datetime.fromtimestamp(sent_at, tz=timezone.utc).isoformat()
|
|
34
|
+
return str(sent_at)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _stable_message_id(conversation_id: str, sent_at: Any, content: str) -> str:
    """Build a deterministic message id so re-uploads upsert instead of duplicating.

    The id has the form ``signal_import:{conversation_id}:{ts}:{hash}`` where
    ``hash`` is the first 12 hex characters of the SHA-256 of
    ``"{conversation_id}:{sent_at}:{content}"`` (using the raw ``sent_at``),
    and ``ts`` is ``sent_at`` truncated to an integer, divided by 1000 when it
    exceeds ``1e12``.

    Args:
        conversation_id: Conversation the message belongs to.
        sent_at: Raw timestamp from the export; any type is accepted.
        content: Message body.

    Returns:
        The stable id string. ``ts`` is ``0`` when ``sent_at`` is not numeric
        or is a non-finite float (NaN / infinity).
    """
    raw = f"{conversation_id}:{sent_at}:{content}"
    # errors="replace" keeps lone surrogates in the text from raising.
    digest = hashlib.sha256(raw.encode("utf-8", errors="replace")).hexdigest()[:12]

    ts = 0
    if isinstance(sent_at, (int, float)):
        try:
            ts = int(sent_at)
        except (ValueError, OverflowError):
            # int() rejects NaN and infinity; treat them like a missing timestamp.
            ts = 0
    if ts > 1e12:
        # Millisecond timestamps are reduced to seconds. The float division is
        # kept as-is so ids generated by earlier imports stay identical.
        ts = int(ts / 1000)
    return f"signal_import:{conversation_id}:{ts}:{digest}"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def parse_signal_export_json(
    data: bytes | str,
    *,
    my_phone_number: Optional[str] = None,
    owner_user_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Parse JSON export (array of message objects) into staging records for ConversationsTablesManager.

    A top-level JSON value that is not a list is treated as a one-element
    export. Entries that are not JSON objects are skipped.

    Args:
        data: Export contents. Bytes are decoded as UTF-8, replacing
            undecodable bytes.
        my_phone_number: Owner's phone number. When given, a message that is
            not already of type "outgoing" is still marked ``from_self`` if,
            after removing spaces and hyphens, the sender's number contains
            this number or is contained in it.
        owner_user_id: When truthy, copied onto every record as
            ``owner_user_id``.

    Returns:
        One dict per message object with the keys message_id, conversation_id,
        thread_id (same value as conversation_id), ts, sender_type
        ("self" | "contact"), content, source_id ("signal"), from_self,
        sender_id, message_type ("message" | "system") and event_type.
        reply_to_message_id, _metadata and owner_user_id are added only when
        a value is available.

    Raises:
        ValueError: If ``data`` is not valid JSON.
    """
    if isinstance(data, bytes):
        data = data.decode("utf-8", errors="replace")
    try:
        arr = json.loads(data)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON: {e}") from e
    if not isinstance(arr, list):
        arr = [arr]

    # The owner's number does not change per message, so normalize it once.
    norm_phone = (
        my_phone_number.replace(" ", "").replace("-", "").strip()
        if my_phone_number
        else ""
    )
    # Raw export fields preserved verbatim under "_metadata" when present.
    metadata_keys = (
        "quoteId", "quotedMessageId", "replyToMessageId", "reply_to_message_id",
        "quoteAuthorAci", "quoteAuthorUuid", "quoteAuthor", "quoteText", "quoteBody",
        "storyReplyContext", "groupV2Change", "groupUpdate", "groupChange",
        "callId", "callHistoryDetails", "expiresTimer", "expirationStartTimestamp",
        "isErased", "isViewOnce", "isStory",
    )

    records: List[Dict[str, Any]] = []
    for i, obj in enumerate(arr):
        if not isinstance(obj, dict):
            continue
        conv_id = str(obj.get("conversationId") or obj.get("conversation_id") or f"conv_{i}")
        body = obj.get("body") or obj.get("content") or ""
        sent_at = obj.get("sent_at") or obj.get("created_at") or obj.get("date")
        # str() keeps a non-string "type" value from raising on .lower().
        msg_type = str(obj.get("type") or "").lower()
        source_phone = obj.get("source") or obj.get("sender") or obj.get("sender_phone")
        if isinstance(source_phone, dict):
            source_phone = source_phone.get("number") or source_phone.get("phone")
        source_phone = str(source_phone).strip() if source_phone else None

        from_self = msg_type == "outgoing"
        if not from_self and norm_phone and source_phone:
            norm_source = source_phone.replace(" ", "").replace("-", "").strip()
            # The containment test is parenthesized so both emptiness guards
            # apply to it; otherwise an empty normalized sender ("" is a
            # substring of every string) would mark the message as from_self.
            if norm_source and (norm_phone in norm_source or norm_source in norm_phone):
                from_self = True
        sender_type = "self" if from_self else "contact"

        # Any explicit type other than outgoing/incoming is recorded as a system event.
        message_type = "system" if msg_type and msg_type not in {"outgoing", "incoming"} else "message"
        event_type = f"signal_type:{msg_type}" if message_type == "system" else None
        reply_to_message_id = (
            obj.get("quoteId")
            or obj.get("quotedMessageId")
            or obj.get("replyToMessageId")
            or obj.get("reply_to_message_id")
        )

        message_id = _stable_message_id(conv_id, sent_at, body)
        ts = _norm_ts(sent_at)
        content = body
        if not content and message_type == "system":
            # Give body-less system events a placeholder so content is never empty.
            content = f"[system_event:{msg_type}]"

        rec: Dict[str, Any] = {
            "message_id": message_id,
            "conversation_id": conv_id,
            "thread_id": conv_id,
            "ts": ts,
            "sender_type": sender_type,
            "content": content,
            "source_id": "signal",
            "from_self": from_self,
            "sender_id": source_phone,
            "message_type": message_type,
            "event_type": event_type,
        }
        if reply_to_message_id is not None:
            rec["reply_to_message_id"] = str(reply_to_message_id)
        metadata = {key: obj[key] for key in metadata_keys if obj.get(key) is not None}
        if metadata:
            rec["_metadata"] = metadata
        if owner_user_id:
            rec["owner_user_id"] = owner_user_id
        records.append(rec)
    return records
|