topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1020 @@
|
|
|
1
|
+
"""Canonical tables for human-to-human conversations (messenger ingestion).
|
|
2
|
+
|
|
3
|
+
Stores thread metadata in `conversations` and message rows in `conversation_messages`.
|
|
4
|
+
Used when canonical_group_id="conversations" (e.g. iMessage, Signal). Not ai_chat_*.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import logging
|
|
11
|
+
import hashlib
|
|
12
|
+
from datetime import datetime, timezone
|
|
13
|
+
from typing import Any, Dict, List, Optional
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger("topos.storage.canonical.conversations_tables")
|
|
16
|
+
|
|
17
|
+
CONVERSATIONS_TABLE = "conversations"
|
|
18
|
+
CONVERSATION_MESSAGES_TABLE = "conversation_messages"
|
|
19
|
+
CONTACTS_TABLE = "contacts"
|
|
20
|
+
CONTACT_IDENTIFIERS_TABLE = "contact_identifiers"
|
|
21
|
+
CONVERSATION_PARTICIPANTS_TABLE = "conversation_participants"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _message_timestamp_unix_for_sort(event_at: Optional[str], created_at: Optional[str]) -> float:
|
|
25
|
+
"""
|
|
26
|
+
Parse message time for chronological ordering (larger = more recent).
|
|
27
|
+
|
|
28
|
+
SQLite ``ORDER BY event_at DESC`` on TEXT can mis-order values (mixed ISO shapes,
|
|
29
|
+
epoch strings, empty event_at with valid created_at). Callers fetch a bounded set
|
|
30
|
+
and re-sort in Python using this key.
|
|
31
|
+
"""
|
|
32
|
+
for raw in (event_at, created_at):
|
|
33
|
+
if raw is None:
|
|
34
|
+
continue
|
|
35
|
+
s = str(raw).strip()
|
|
36
|
+
if not s:
|
|
37
|
+
continue
|
|
38
|
+
try:
|
|
39
|
+
iso = s
|
|
40
|
+
if iso.endswith("Z"):
|
|
41
|
+
iso = iso[:-1] + "+00:00"
|
|
42
|
+
dt = datetime.fromisoformat(iso)
|
|
43
|
+
if dt.tzinfo is None:
|
|
44
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
45
|
+
return dt.timestamp()
|
|
46
|
+
except ValueError:
|
|
47
|
+
pass
|
|
48
|
+
try:
|
|
49
|
+
norm = s[:19].replace("T", " ")
|
|
50
|
+
dt = datetime.strptime(norm, "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
|
|
51
|
+
return dt.timestamp()
|
|
52
|
+
except ValueError:
|
|
53
|
+
pass
|
|
54
|
+
if s.isdigit() and len(s) >= 10:
|
|
55
|
+
try:
|
|
56
|
+
val = int(s)
|
|
57
|
+
if val > 10_000_000_000:
|
|
58
|
+
val //= 1000
|
|
59
|
+
return float(val)
|
|
60
|
+
except ValueError:
|
|
61
|
+
pass
|
|
62
|
+
return 0.0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def ensure_conversations_table(conn) -> None:
    """Create the thread-metadata table and its two lookup indexes when missing.

    All statements use ``IF NOT EXISTS``; the connection is committed once
    after the last statement.
    """
    table_ddl = f"""
        CREATE TABLE IF NOT EXISTS {CONVERSATIONS_TABLE} (
            conversation_id TEXT NOT NULL,
            dataset_id TEXT NOT NULL,
            source_id TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (conversation_id, dataset_id)
        )
    """
    dataset_index_ddl = f"""
        CREATE INDEX IF NOT EXISTS idx_{CONVERSATIONS_TABLE}_dataset_id
        ON {CONVERSATIONS_TABLE}(dataset_id)
    """
    source_index_ddl = f"""
        CREATE INDEX IF NOT EXISTS idx_{CONVERSATIONS_TABLE}_source_id
        ON {CONVERSATIONS_TABLE}(source_id)
    """
    # Table first, then the indexes that reference it.
    for statement in (table_ddl, dataset_index_ddl, source_index_ddl):
        conn.execute(statement)
    conn.commit()
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def ensure_conversation_messages_table(conn) -> None:
    """Create the message-row table and its four lookup indexes when missing.

    The schema includes the Stage 9 columns ``event_at`` and ``is_from_self``.
    All statements use ``IF NOT EXISTS``; the connection is committed once
    after the last statement.
    """
    table_ddl = f"""
        CREATE TABLE IF NOT EXISTS {CONVERSATION_MESSAGES_TABLE} (
            message_id TEXT NOT NULL PRIMARY KEY,
            conversation_id TEXT NOT NULL,
            dataset_id TEXT NOT NULL,
            sender_type TEXT,
            sender_id TEXT,
            reply_to_message_id TEXT,
            message_type TEXT,
            event_type TEXT,
            content TEXT,
            event_at TEXT NOT NULL,
            source_id TEXT NOT NULL,
            metadata_json TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            is_from_self INTEGER DEFAULT 0,
            owner_user_id TEXT
        )
    """
    conversation_index_ddl = f"""
        CREATE INDEX IF NOT EXISTS idx_{CONVERSATION_MESSAGES_TABLE}_conversation_id
        ON {CONVERSATION_MESSAGES_TABLE}(conversation_id)
    """
    dataset_index_ddl = f"""
        CREATE INDEX IF NOT EXISTS idx_{CONVERSATION_MESSAGES_TABLE}_dataset_id
        ON {CONVERSATION_MESSAGES_TABLE}(dataset_id)
    """
    source_index_ddl = f"""
        CREATE INDEX IF NOT EXISTS idx_{CONVERSATION_MESSAGES_TABLE}_source_id
        ON {CONVERSATION_MESSAGES_TABLE}(source_id)
    """
    event_at_index_ddl = f"""
        CREATE INDEX IF NOT EXISTS idx_{CONVERSATION_MESSAGES_TABLE}_event_at
        ON {CONVERSATION_MESSAGES_TABLE}(event_at)
    """
    # Table first, then the indexes that reference it.
    ordered_statements = (
        table_ddl,
        conversation_index_ddl,
        dataset_index_ddl,
        source_index_ddl,
        event_at_index_ddl,
    )
    for statement in ordered_statements:
        conn.execute(statement)
    conn.commit()
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def ensure_contacts_table(conn) -> None:
    """Create the canonical contacts table and its two indexes when missing.

    All statements use ``IF NOT EXISTS``; the connection is committed once
    after the last statement.
    """
    table_ddl = f"""
        CREATE TABLE IF NOT EXISTS {CONTACTS_TABLE} (
            contact_id TEXT NOT NULL PRIMARY KEY,
            dataset_id TEXT NOT NULL,
            source_id TEXT NOT NULL,
            display_name TEXT,
            known_usernames_json TEXT,
            is_self INTEGER NOT NULL DEFAULT 0,
            last_import_source TEXT,
            last_import_run_id TEXT,
            last_imported_at TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now'))
        )
    """
    dataset_source_index_ddl = f"""
        CREATE INDEX IF NOT EXISTS idx_{CONTACTS_TABLE}_dataset_source
        ON {CONTACTS_TABLE}(dataset_id, source_id)
    """
    is_self_index_ddl = f"""
        CREATE INDEX IF NOT EXISTS idx_{CONTACTS_TABLE}_is_self
        ON {CONTACTS_TABLE}(is_self)
    """
    # Table first, then the indexes that reference it.
    for statement in (table_ddl, dataset_source_index_ddl, is_self_index_ddl):
        conn.execute(statement)
    conn.commit()
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def ensure_contact_identifiers_table(conn) -> None:
    """Create the identifier -> contact mapping table (phone/email/service ids).

    Rows are keyed by ``(dataset_id, source_id, identifier)``; an index on
    ``contact_id`` is created as well. Both statements use ``IF NOT EXISTS``
    and the connection is committed afterwards.
    """
    table_ddl = f"""
        CREATE TABLE IF NOT EXISTS {CONTACT_IDENTIFIERS_TABLE} (
            dataset_id TEXT NOT NULL,
            source_id TEXT NOT NULL,
            identifier TEXT NOT NULL,
            identifier_type TEXT NOT NULL,
            contact_id TEXT NOT NULL,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (dataset_id, source_id, identifier)
        )
    """
    contact_index_ddl = f"""
        CREATE INDEX IF NOT EXISTS idx_{CONTACT_IDENTIFIERS_TABLE}_contact
        ON {CONTACT_IDENTIFIERS_TABLE}(contact_id)
    """
    # Table first, then the index that references it.
    for statement in (table_ddl, contact_index_ddl):
        conn.execute(statement)
    conn.commit()
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def ensure_conversation_participants_table(conn) -> None:
    """Create the conversation <-> contact relationship table when missing.

    Rows are keyed by ``(conversation_id, dataset_id, source_id, contact_id)``;
    an index on ``(dataset_id, source_id)`` is created as well. Both statements
    use ``IF NOT EXISTS`` and the connection is committed afterwards.
    """
    table_ddl = f"""
        CREATE TABLE IF NOT EXISTS {CONVERSATION_PARTICIPANTS_TABLE} (
            conversation_id TEXT NOT NULL,
            dataset_id TEXT NOT NULL,
            source_id TEXT NOT NULL,
            contact_id TEXT NOT NULL,
            role TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (conversation_id, dataset_id, source_id, contact_id)
        )
    """
    dataset_source_index_ddl = f"""
        CREATE INDEX IF NOT EXISTS idx_{CONVERSATION_PARTICIPANTS_TABLE}_dataset_source
        ON {CONVERSATION_PARTICIPANTS_TABLE}(dataset_id, source_id)
    """
    # Table first, then the index that references it.
    for statement in (table_ddl, dataset_source_index_ddl):
        conn.execute(statement)
    conn.commit()
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _ensure_signal_identity_columns(conn) -> None:
    """Add ``is_from_self`` and ``owner_user_id`` to the messages table. Idempotent.

    Stage 9 renamed ``from_self`` to ``is_from_self``. Each column is added in
    its own ALTER TABLE + commit. Any failure triggers a rollback; failures
    whose message does not contain "duplicate column" are logged at debug level.
    """
    column_specs = {
        "is_from_self": "INTEGER DEFAULT 0",
        "owner_user_id": "TEXT",
    }
    for column_name, column_decl in column_specs.items():
        ddl = f"ALTER TABLE {CONVERSATION_MESSAGES_TABLE} ADD COLUMN {column_name} {column_decl}"
        try:
            conn.execute(ddl)
            conn.commit()
        except Exception as exc:
            # "duplicate column" is the expected outcome once the column exists.
            already_present = "duplicate column" in str(exc).lower()
            if not already_present:
                logger.debug("Signal identity column %s: %s", column_name, exc)
            conn.rollback()
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _ensure_reply_and_event_columns(conn) -> None:
    """Add the unified reply/system TEXT columns to the messages table. Idempotent.

    Each column is added in its own ALTER TABLE + commit. Any failure triggers
    a rollback; failures whose message does not contain "duplicate column" are
    logged at debug level.
    """
    text_columns = ("reply_to_message_id", "message_type", "event_type")
    for column_name in text_columns:
        ddl = f"ALTER TABLE {CONVERSATION_MESSAGES_TABLE} ADD COLUMN {column_name} TEXT"
        try:
            conn.execute(ddl)
            conn.commit()
        except Exception as exc:
            # "duplicate column" is the expected outcome once the column exists.
            already_present = "duplicate column" in str(exc).lower()
            if not already_present:
                logger.debug("Messenger context column %s: %s", column_name, exc)
            conn.rollback()
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _ensure_contact_provenance_columns(conn) -> None:
    """Add the contact import/profile TEXT columns to the contacts table. Idempotent.

    Each column is added in its own ALTER TABLE + commit. Any failure triggers
    a rollback; failures whose message does not contain "duplicate column" are
    logged at debug level.
    """
    text_columns = (
        "known_usernames_json",
        "last_import_source",
        "last_import_run_id",
        "last_imported_at",
    )
    for column_name in text_columns:
        ddl = f"ALTER TABLE {CONTACTS_TABLE} ADD COLUMN {column_name} TEXT"
        try:
            conn.execute(ddl)
            conn.commit()
        except Exception as exc:
            # "duplicate column" is the expected outcome once the column exists.
            already_present = "duplicate column" in str(exc).lower()
            if not already_present:
                logger.debug("Contact provenance column %s: %s", column_name, exc)
            conn.rollback()
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _ensure_contact_sharing_policy_column(conn) -> None:
    """Add ``sharing_policy_json`` to the contacts table. Idempotent.

    Stage 11: the column holds a JSON policy for name_visibility /
    row_visibility per contact. Any failure triggers a rollback; failures whose
    message does not contain "duplicate column" are logged at debug level.
    """
    ddl = f"ALTER TABLE {CONTACTS_TABLE} ADD COLUMN sharing_policy_json TEXT"
    try:
        conn.execute(ddl)
        conn.commit()
    except Exception as exc:
        # "duplicate column" is the expected outcome once the column exists.
        already_present = "duplicate column" in str(exc).lower()
        if not already_present:
            logger.debug("Contact sharing_policy_json column: %s", exc)
        conn.rollback()
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def ensure_all_tables(conn) -> None:
    """Create every conversations-group table and apply the additive column steps.

    Tables are created first (conversations, conversation_messages, contacts,
    contact_identifiers, conversation_participants), followed by the idempotent
    ADD COLUMN helpers for the messages and contacts tables.

    Stage 9 column renames run at engine startup (app.py) to avoid blocking the
    event loop during requests.
    """
    schema_steps = (
        # Base tables.
        ensure_conversations_table,
        ensure_conversation_messages_table,
        ensure_contacts_table,
        ensure_contact_identifiers_table,
        ensure_conversation_participants_table,
        # Additive columns for databases created with an older schema.
        _ensure_reply_and_event_columns,
        _ensure_signal_identity_columns,
        _ensure_contact_provenance_columns,
        _ensure_contact_sharing_policy_column,
    )
    for step in schema_steps:
        step(conn)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
class ConversationsTablesManager:
|
|
270
|
+
"""Writes to conversations and conversation_messages only (canonical_group_id=conversations)."""
|
|
271
|
+
|
|
272
|
+
def __init__(self, conn) -> None:
|
|
273
|
+
self.conn = conn
|
|
274
|
+
|
|
275
|
+
def ensure_tables(self) -> None:
|
|
276
|
+
"""Create conversations and conversation_messages tables if not exist."""
|
|
277
|
+
if self.conn:
|
|
278
|
+
ensure_all_tables(self.conn)
|
|
279
|
+
|
|
280
|
+
def upsert_conversation(
|
|
281
|
+
self,
|
|
282
|
+
conversation_id: str,
|
|
283
|
+
dataset_id: str,
|
|
284
|
+
source_id: Optional[str] = None,
|
|
285
|
+
) -> None:
|
|
286
|
+
"""Insert or replace one row in conversations."""
|
|
287
|
+
if not self.conn:
|
|
288
|
+
return
|
|
289
|
+
self.ensure_tables()
|
|
290
|
+
self.conn.execute(f"""
|
|
291
|
+
INSERT OR REPLACE INTO {CONVERSATIONS_TABLE}
|
|
292
|
+
(conversation_id, dataset_id, source_id, created_at, updated_at)
|
|
293
|
+
VALUES (?, ?, ?, datetime('now'), datetime('now'))
|
|
294
|
+
""", (conversation_id, dataset_id, source_id or ""))
|
|
295
|
+
self.conn.commit()
|
|
296
|
+
|
|
297
|
+
def upsert_message_batch(
|
|
298
|
+
self,
|
|
299
|
+
records: List[Dict[str, Any]],
|
|
300
|
+
dataset_id: str,
|
|
301
|
+
source_id: str,
|
|
302
|
+
) -> Dict[str, int]:
|
|
303
|
+
"""
|
|
304
|
+
Upsert messages into conversation_messages and ensure parent rows in conversations.
|
|
305
|
+
Each record must have: message_id, thread_id or conversation_id, ts, sender_type, content.
|
|
306
|
+
Optional: sender_id, _metadata, from_self (0/1), owner_user_id (for Signal identity).
|
|
307
|
+
"""
|
|
308
|
+
if not self.conn or not records:
|
|
309
|
+
return {"messages_created": 0, "conversations_created": 0}
|
|
310
|
+
self.ensure_tables()
|
|
311
|
+
|
|
312
|
+
def _normalize_identifier_type(identifier: str) -> str:
|
|
313
|
+
low = (identifier or "").lower()
|
|
314
|
+
if low.startswith("+") or low.replace("-", "").replace(" ", "").isdigit():
|
|
315
|
+
return "phone"
|
|
316
|
+
if "@" in low:
|
|
317
|
+
return "email"
|
|
318
|
+
if ":" in low or len(low) > 24:
|
|
319
|
+
return "service_id"
|
|
320
|
+
return "handle"
|
|
321
|
+
|
|
322
|
+
def _merge_known_usernames_json(
|
|
323
|
+
existing_json: Optional[str],
|
|
324
|
+
candidates: List[str],
|
|
325
|
+
) -> Optional[str]:
|
|
326
|
+
norm: List[str] = []
|
|
327
|
+
seen: set[str] = set()
|
|
328
|
+
try:
|
|
329
|
+
existing = json.loads(existing_json or "[]")
|
|
330
|
+
if not isinstance(existing, list):
|
|
331
|
+
existing = []
|
|
332
|
+
except Exception:
|
|
333
|
+
existing = []
|
|
334
|
+
for raw in list(existing) + list(candidates):
|
|
335
|
+
val = str(raw or "").strip()
|
|
336
|
+
if not val or len(val) > 128:
|
|
337
|
+
continue
|
|
338
|
+
low = val.lower()
|
|
339
|
+
if low in seen:
|
|
340
|
+
continue
|
|
341
|
+
seen.add(low)
|
|
342
|
+
norm.append(val)
|
|
343
|
+
return json.dumps(norm, ensure_ascii=False) if norm else None
|
|
344
|
+
|
|
345
|
+
def _extract_display_name(rec: Dict[str, Any]) -> Optional[str]:
|
|
346
|
+
metadata = rec.get("_metadata")
|
|
347
|
+
if isinstance(metadata, dict):
|
|
348
|
+
for key in (
|
|
349
|
+
"sender_name",
|
|
350
|
+
"display_name",
|
|
351
|
+
"contact_name",
|
|
352
|
+
"profileName",
|
|
353
|
+
"profile_name",
|
|
354
|
+
"quoteAuthor",
|
|
355
|
+
):
|
|
356
|
+
val = metadata.get(key)
|
|
357
|
+
if isinstance(val, str) and val.strip():
|
|
358
|
+
return val.strip()
|
|
359
|
+
return None
|
|
360
|
+
|
|
361
|
+
def _extract_username_candidates(rec: Dict[str, Any], sender_identifier_type: str) -> List[str]:
|
|
362
|
+
out: List[str] = []
|
|
363
|
+
sender_id = str(rec.get("sender_id") or "").strip()
|
|
364
|
+
if sender_id and sender_identifier_type in {"handle", "service_id"} and "@" not in sender_id:
|
|
365
|
+
out.append(sender_id)
|
|
366
|
+
metadata = rec.get("_metadata")
|
|
367
|
+
if isinstance(metadata, dict):
|
|
368
|
+
for key in ("username", "handle", "profileName", "profile_name"):
|
|
369
|
+
val = metadata.get(key)
|
|
370
|
+
if isinstance(val, str) and val.strip():
|
|
371
|
+
out.append(val.strip())
|
|
372
|
+
return out
|
|
373
|
+
|
|
374
|
+
def _upsert_contact(rec: Dict[str, Any]) -> Optional[str]:
|
|
375
|
+
sender_id = str(rec.get("sender_id") or "").strip()
|
|
376
|
+
if not sender_id:
|
|
377
|
+
return None
|
|
378
|
+
sender_identifier_type = _normalize_identifier_type(sender_id)
|
|
379
|
+
is_self = 1 if (rec.get("from_self") is True or rec.get("is_from_self") is True or sender_id.lower() == "self") else 0
|
|
380
|
+
row = self.conn.execute(
|
|
381
|
+
f"""
|
|
382
|
+
SELECT contact_id
|
|
383
|
+
FROM {CONTACT_IDENTIFIERS_TABLE}
|
|
384
|
+
WHERE dataset_id = ?
|
|
385
|
+
AND identifier = ?
|
|
386
|
+
AND source_id IN (?, '*')
|
|
387
|
+
ORDER BY CASE WHEN source_id = ? THEN 0 ELSE 1 END
|
|
388
|
+
LIMIT 1
|
|
389
|
+
""",
|
|
390
|
+
(dataset_id, sender_id, source_id, source_id),
|
|
391
|
+
).fetchone()
|
|
392
|
+
if row and row[0]:
|
|
393
|
+
contact_id = str(row[0])
|
|
394
|
+
else:
|
|
395
|
+
key = f"{sender_identifier_type}:{sender_id}"
|
|
396
|
+
digest = hashlib.sha1(key.encode("utf-8")).hexdigest()[:20]
|
|
397
|
+
contact_id = f"{dataset_id}:contact:{digest}"
|
|
398
|
+
display_name = _extract_display_name(rec)
|
|
399
|
+
usernames_json = _merge_known_usernames_json(
|
|
400
|
+
(
|
|
401
|
+
self.conn.execute(
|
|
402
|
+
f"SELECT known_usernames_json FROM {CONTACTS_TABLE} WHERE contact_id = ? LIMIT 1",
|
|
403
|
+
(contact_id,),
|
|
404
|
+
).fetchone() or [None]
|
|
405
|
+
)[0],
|
|
406
|
+
_extract_username_candidates(rec, sender_identifier_type),
|
|
407
|
+
)
|
|
408
|
+
self.conn.execute(
|
|
409
|
+
f"""
|
|
410
|
+
INSERT INTO {CONTACTS_TABLE}
|
|
411
|
+
(contact_id, dataset_id, source_id, display_name, known_usernames_json, is_self, last_import_source, last_import_run_id, last_imported_at, created_at, updated_at)
|
|
412
|
+
VALUES (?, ?, 'global', ?, ?, ?, NULL, NULL, NULL, datetime('now'), datetime('now'))
|
|
413
|
+
ON CONFLICT(contact_id) DO UPDATE SET
|
|
414
|
+
display_name = COALESCE(excluded.display_name, {CONTACTS_TABLE}.display_name),
|
|
415
|
+
known_usernames_json = COALESCE(excluded.known_usernames_json, {CONTACTS_TABLE}.known_usernames_json),
|
|
416
|
+
is_self = CASE WHEN excluded.is_self = 1 THEN 1 ELSE {CONTACTS_TABLE}.is_self END,
|
|
417
|
+
updated_at = datetime('now')
|
|
418
|
+
""",
|
|
419
|
+
(contact_id, dataset_id, display_name, usernames_json, is_self),
|
|
420
|
+
)
|
|
421
|
+
self.conn.execute(
|
|
422
|
+
f"""
|
|
423
|
+
INSERT INTO {CONTACT_IDENTIFIERS_TABLE}
|
|
424
|
+
(dataset_id, source_id, identifier, identifier_type, contact_id, created_at, updated_at)
|
|
425
|
+
VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))
|
|
426
|
+
ON CONFLICT(dataset_id, source_id, identifier) DO UPDATE SET
|
|
427
|
+
contact_id = excluded.contact_id,
|
|
428
|
+
updated_at = datetime('now')
|
|
429
|
+
""",
|
|
430
|
+
(
|
|
431
|
+
dataset_id,
|
|
432
|
+
source_id,
|
|
433
|
+
sender_id,
|
|
434
|
+
sender_identifier_type,
|
|
435
|
+
contact_id,
|
|
436
|
+
),
|
|
437
|
+
)
|
|
438
|
+
return contact_id
|
|
439
|
+
|
|
440
|
+
conversations_created = 0
|
|
441
|
+
seen_conversation_ids: set[tuple[str, str]] = set()
|
|
442
|
+
participants_seen: set[tuple[str, str]] = set()
|
|
443
|
+
for rec in records:
|
|
444
|
+
conversation_id = (
|
|
445
|
+
str(rec.get("conversation_id") or rec.get("thread_id") or dataset_id)
|
|
446
|
+
)
|
|
447
|
+
key = (conversation_id, dataset_id)
|
|
448
|
+
if key not in seen_conversation_ids:
|
|
449
|
+
self.upsert_conversation(conversation_id, dataset_id, source_id)
|
|
450
|
+
seen_conversation_ids.add(key)
|
|
451
|
+
conversations_created += 1
|
|
452
|
+
contact_id = _upsert_contact(rec)
|
|
453
|
+
if contact_id:
|
|
454
|
+
part_key = (conversation_id, contact_id)
|
|
455
|
+
if part_key not in participants_seen:
|
|
456
|
+
self.conn.execute(
|
|
457
|
+
f"""
|
|
458
|
+
INSERT INTO {CONVERSATION_PARTICIPANTS_TABLE}
|
|
459
|
+
(conversation_id, dataset_id, source_id, contact_id, role, created_at, updated_at)
|
|
460
|
+
VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))
|
|
461
|
+
ON CONFLICT(conversation_id, dataset_id, source_id, contact_id) DO UPDATE SET
|
|
462
|
+
role = COALESCE(excluded.role, {CONVERSATION_PARTICIPANTS_TABLE}.role),
|
|
463
|
+
updated_at = datetime('now')
|
|
464
|
+
""",
|
|
465
|
+
(
|
|
466
|
+
conversation_id,
|
|
467
|
+
dataset_id,
|
|
468
|
+
source_id,
|
|
469
|
+
contact_id,
|
|
470
|
+
"self" if (rec.get("from_self") or rec.get("is_from_self")) else "participant",
|
|
471
|
+
),
|
|
472
|
+
)
|
|
473
|
+
participants_seen.add(part_key)
|
|
474
|
+
for rec in records:
|
|
475
|
+
message_id = str(rec.get("message_id") or "")
|
|
476
|
+
if not message_id:
|
|
477
|
+
continue
|
|
478
|
+
conversation_id = (
|
|
479
|
+
str(rec.get("conversation_id") or rec.get("thread_id") or dataset_id)
|
|
480
|
+
)
|
|
481
|
+
event_at = rec.get("event_at") or rec.get("ts") or ""
|
|
482
|
+
sender_type = rec.get("sender_type")
|
|
483
|
+
sender_id = rec.get("sender_id")
|
|
484
|
+
reply_to_message_id = rec.get("reply_to_message_id")
|
|
485
|
+
message_type = rec.get("message_type")
|
|
486
|
+
event_type = rec.get("event_type")
|
|
487
|
+
content = rec.get("content")
|
|
488
|
+
metadata_json = None
|
|
489
|
+
if "_metadata" in rec:
|
|
490
|
+
metadata_json = json.dumps(rec["_metadata"], ensure_ascii=False)
|
|
491
|
+
is_from_self = 1 if (rec.get("is_from_self") is True or rec.get("from_self") is True) else 0
|
|
492
|
+
owner_user_id = rec.get("owner_user_id")
|
|
493
|
+
self.conn.execute(f"""
|
|
494
|
+
INSERT OR REPLACE INTO {CONVERSATION_MESSAGES_TABLE}
|
|
495
|
+
(message_id, conversation_id, dataset_id, sender_type, sender_id, reply_to_message_id, message_type, event_type, content, event_at, source_id, metadata_json, is_from_self, owner_user_id, created_at)
|
|
496
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'))
|
|
497
|
+
""", (
|
|
498
|
+
message_id,
|
|
499
|
+
conversation_id,
|
|
500
|
+
dataset_id,
|
|
501
|
+
sender_type,
|
|
502
|
+
sender_id,
|
|
503
|
+
reply_to_message_id,
|
|
504
|
+
message_type,
|
|
505
|
+
event_type,
|
|
506
|
+
content,
|
|
507
|
+
event_at,
|
|
508
|
+
source_id,
|
|
509
|
+
metadata_json,
|
|
510
|
+
is_from_self,
|
|
511
|
+
owner_user_id,
|
|
512
|
+
))
|
|
513
|
+
self.conn.commit()
|
|
514
|
+
logger.debug(
|
|
515
|
+
"[PIPELINE:CONVERSATIONS] Wrote %d messages to %s, %d conversation rows",
|
|
516
|
+
len(records),
|
|
517
|
+
CONVERSATION_MESSAGES_TABLE,
|
|
518
|
+
len(seen_conversation_ids),
|
|
519
|
+
)
|
|
520
|
+
return {"messages_created": len(records), "conversations_created": conversations_created}
|
|
521
|
+
|
|
522
|
+
def list_contacts(
    self,
    *,
    dataset_id: str,
    source_id: str,
    limit: int = 200,
) -> List[Dict[str, Any]]:
    """List contacts with their identifiers and per-identifier message counts.

    One entry is produced per (contact, identifier) pair visible to
    ``source_id``. A contact with no matching identifier yields a single entry
    whose ``identifier`` is ``None`` and whose ``message_count`` is 0.

    Args:
        dataset_id: Dataset whose contacts are listed.
        source_id: Source whose messages are counted. Identifier rows scoped to
            this source or to the wildcard scope ``'*'`` are both considered.
        limit: Maximum number of rows returned.

    Returns:
        A list of dicts ordered by ``is_self`` (self first), then message
        count, then most recently updated. Empty when there is no connection.
    """
    if not self.conn:
        return []
    self.ensure_tables()

    def _load_json(raw: Any, expected: type) -> Any:
        """Decode ``raw`` as JSON; fall back to an empty ``expected`` instance."""
        if not raw:
            return expected()
        try:
            value = json.loads(raw)
        except (TypeError, ValueError):
            # Malformed stored JSON must not break the listing.
            return expected()
        return value if isinstance(value, expected) else expected()

    # The identifier join accepts both the source scope and the '*' scope, so
    # the same identifier can match twice and duplicate every joined message
    # row. COUNT(DISTINCT ...) keeps the per-identifier count accurate.
    rows = self.conn.execute(
        f"""
        SELECT c.contact_id,
               c.display_name,
               c.known_usernames_json,
               c.sharing_policy_json,
               c.is_self,
               c.last_import_source,
               c.last_import_run_id,
               c.last_imported_at,
               ci.identifier AS primary_identifier,
               ci.identifier_type AS primary_identifier_type,
               COUNT(DISTINCT m.message_id) AS message_count
        FROM {CONTACTS_TABLE} c
        LEFT JOIN {CONTACT_IDENTIFIERS_TABLE} ci
          ON ci.contact_id = c.contact_id
         AND ci.dataset_id = c.dataset_id
         AND ci.source_id IN (?, '*')
        LEFT JOIN {CONVERSATION_MESSAGES_TABLE} m
          ON m.dataset_id = c.dataset_id
         AND m.source_id = ?
         AND m.sender_id = ci.identifier
        WHERE c.dataset_id = ?
        GROUP BY c.contact_id, c.display_name, c.known_usernames_json, c.sharing_policy_json, c.is_self, c.last_import_source, c.last_import_run_id, c.last_imported_at, ci.identifier, ci.identifier_type
        ORDER BY c.is_self DESC, message_count DESC, c.updated_at DESC
        LIMIT ?
        """,
        (source_id, source_id, dataset_id, int(limit)),
    ).fetchall()

    return [
        {
            "contact_id": row[0],
            "display_name": row[1],
            "known_usernames": _load_json(row[2], list),
            "sharing_policy": _load_json(row[3], dict),
            "is_self": bool(row[4]),
            "last_import_source": row[5],
            "last_import_run_id": row[6],
            "last_imported_at": row[7],
            "identifier": row[8],
            "identifier_type": row[9],
            "message_count": int(row[10] or 0),
        }
        for row in rows
    ]
|
|
593
|
+
|
|
594
|
+
def get_contact_message_samples(
    self,
    *,
    dataset_id: str,
    source_id: str,
    identifier: str,
    limit: int = 5,
) -> List[Dict[str, Any]]:
    """Fetch the latest messages sent by ``identifier``, newest first.

    Args:
        dataset_id: Dataset to search.
        source_id: Source the messages belong to.
        identifier: Sender identifier matched against ``sender_id``.
        limit: Number of samples wanted; values below 1 are treated as 1.

    Returns:
        Up to ``limit`` dicts with ``message_id``, ``content``, ``event_at`` and
        ``conversation_id``. Empty when there is no connection or identifier.
    """
    if not (self.conn and identifier):
        return []

    wanted = max(1, int(limit))
    # The SQL ordering compares timestamps as strings, which may not match real
    # chronological order, so over-fetch candidates and re-rank them in Python.
    candidate_cap = min(500, max(60, wanted * 25))
    query = f"""
        SELECT message_id, content, event_at, conversation_id, created_at
        FROM {CONVERSATION_MESSAGES_TABLE}
        WHERE dataset_id = ?
          AND source_id = ?
          AND sender_id = ?
        ORDER BY event_at DESC, created_at DESC, message_id DESC
        LIMIT ?
        """
    fetched = self.conn.execute(
        query,
        (dataset_id, source_id, identifier, candidate_cap),
    ).fetchall()

    def _recency_key(rec: Any) -> tuple:
        """Rank by parsed timestamp, using the message id as a tie-breaker."""
        event_text = None if rec[2] is None else str(rec[2])
        created_text = str(rec[4]) if len(rec) > 4 and rec[4] is not None else None
        return (
            _message_timestamp_unix_for_sort(event_text, created_text),
            str(rec[0] or ""),
        )

    newest_first = sorted(fetched, key=_recency_key, reverse=True)
    samples: List[Dict[str, Any]] = []
    for message_id, content, event_at, conversation_id, *_ in newest_first[:wanted]:
        samples.append(
            {
                "message_id": message_id,
                "content": content,
                "event_at": event_at,
                "conversation_id": conversation_id,
            }
        )
    return samples
|
|
640
|
+
|
|
641
|
+
def get_contact_conversation_thread_previews(
    self,
    *,
    dataset_id: str,
    source_id: str,
    profile_identifier: str,
    max_conversations: int = 8,
    messages_per_conversation: int = 45,
) -> List[Dict[str, Any]]:
    """Return recent thread slices for conversations the contact took part in.

    Conversations are those in which ``profile_identifier`` sent at least one
    message, ranked by that sender's latest message. Each returned block holds
    messages from every participant in the thread (not just the contact),
    limited to the ``messages_per_conversation`` most recent rows and listed
    newest to oldest.

    Args:
        dataset_id: Dataset to search.
        source_id: Source the messages belong to.
        profile_identifier: Sender identifier of the contact of interest.
        max_conversations: Cap on conversations returned (minimum 1).
        messages_per_conversation: Cap on messages per conversation (minimum 1).

    Returns:
        A list of ``{"conversation_id": ..., "messages": [...]}`` dicts. Empty
        when there is no connection or no identifier.
    """
    if not (self.conn and profile_identifier):
        return []
    self.ensure_tables()
    conversation_cap = max(1, int(max_conversations))
    per_thread_cap = max(1, int(messages_per_conversation))

    def _opt_text(value: Any) -> Optional[str]:
        """Stringify a column value while keeping ``None`` as ``None``."""
        return None if value is None else str(value)

    sent_rows = self.conn.execute(
        f"""
        SELECT conversation_id, event_at, created_at
        FROM {CONVERSATION_MESSAGES_TABLE}
        WHERE dataset_id = ?
          AND source_id = ?
          AND sender_id = ?
        """,
        (dataset_id, source_id, profile_identifier),
    ).fetchall()

    # Latest timestamp of the contact's own messages, per conversation.
    latest_by_conversation: Dict[str, float] = {}
    for raw_conversation_id, event_at, created_at in sent_rows:
        conversation_key = str(raw_conversation_id or "").strip()
        if not conversation_key:
            continue
        stamp = _message_timestamp_unix_for_sort(_opt_text(event_at), _opt_text(created_at))
        previous = latest_by_conversation.get(conversation_key, 0.0)
        latest_by_conversation[conversation_key] = stamp if stamp > previous else previous

    ranked_conversations = sorted(
        latest_by_conversation,
        key=latest_by_conversation.get,
        reverse=True,
    )[:conversation_cap]

    def _chronological_key(rec: Any) -> tuple:
        """Order by parsed timestamp, using the message id as a tie-breaker."""
        created_text = _opt_text(rec[4]) if len(rec) > 4 else None
        return (
            _message_timestamp_unix_for_sort(_opt_text(rec[2]), created_text),
            str(rec[0] or ""),
        )

    thread_query = f"""
        SELECT message_id, content, event_at, conversation_id, created_at, sender_id, is_from_self
        FROM {CONVERSATION_MESSAGES_TABLE}
        WHERE dataset_id = ?
          AND source_id = ?
          AND conversation_id = ?
        """
    previews: List[Dict[str, Any]] = []
    for conversation_key in ranked_conversations:
        thread_rows = self.conn.execute(
            thread_query,
            (dataset_id, source_id, conversation_key),
        ).fetchall()
        oldest_first = sorted(thread_rows, key=_chronological_key)
        # Keep the tail (most recent rows), then flip it so the newest comes first.
        recent_tail = oldest_first[-per_thread_cap:]
        messages = [
            {
                "message_id": rec[0],
                "content": rec[1],
                "event_at": rec[2],
                "conversation_id": rec[3],
                "created_at": rec[4],
                "sender_id": rec[5],
                "is_from_self": False if rec[6] is None else bool(rec[6]),
            }
            for rec in recent_tail[::-1]
        ]
        previews.append({"conversation_id": conversation_key, "messages": messages})
    return previews
|
|
723
|
+
|
|
724
|
+
def update_contact_display_name(
    self,
    *,
    dataset_id: str,
    source_id: str,
    contact_id: str,
    display_name: Optional[str],
) -> None:
    """Set or clear the display name of one contact.

    A falsy ``display_name`` (``None`` or empty string) stores NULL. The row is
    also tagged with ``last_import_source = 'manual_edit'`` and its
    ``updated_at`` is refreshed. ``source_id`` is accepted but not used by the
    update. Does nothing when there is no connection.
    """
    if not self.conn:
        return
    new_name = display_name if display_name else None
    statement = f"""
        UPDATE {CONTACTS_TABLE}
        SET display_name = ?, last_import_source = 'manual_edit', updated_at = datetime('now')
        WHERE dataset_id = ? AND contact_id = ?
        """
    self.conn.execute(statement, (new_name, dataset_id, contact_id))
    self.conn.commit()
|
|
744
|
+
|
|
745
|
+
def update_contact_sharing_policy(
    self,
    *,
    dataset_id: str,
    contact_id: str,
    sharing_policy: Optional[Dict[str, Any]],
) -> None:
    """Stage 11: store a contact's sharing policy in ``sharing_policy_json``.

    The policy mapping (the original note names ``name_visibility`` and
    ``row_visibility`` as its fields) is serialised to JSON. A ``None`` or
    empty policy stores NULL instead. ``updated_at`` is refreshed either way.
    Does nothing when there is no connection.
    """
    if not self.conn:
        return
    self.ensure_tables()
    if sharing_policy:
        serialized = json.dumps(sharing_policy)
    else:
        serialized = None
    statement = f"""
        UPDATE {CONTACTS_TABLE}
        SET sharing_policy_json = ?, updated_at = datetime('now')
        WHERE dataset_id = ? AND contact_id = ?
        """
    self.conn.execute(statement, (serialized, dataset_id, contact_id))
    self.conn.commit()
|
|
766
|
+
|
|
767
|
+
def auto_resolve_contact_names(
    self,
    *,
    dataset_id: str,
    source_id: str,
) -> int:
    """Best-effort fill of missing contact display names from message metadata.

    For each distinct sender, the first message returned by the query (ordered
    by ``event_at`` descending) that carries metadata is inspected for a name
    under a fixed list of keys. The name is written only to contacts whose
    ``display_name`` is still NULL; existing names are never overwritten.

    Args:
        dataset_id: Dataset to process.
        source_id: Source whose messages are scanned. Identifier rows scoped to
            this source are preferred over wildcard (``'*'``) rows.

    Returns:
        Number of contacts whose display name was actually filled in.
        0 when there is no connection.
    """
    if not self.conn:
        return 0

    # Metadata keys probed, in priority order.
    # NOTE(review): "quoteAuthor" looks like it may name the quoted author
    # rather than the sender. Confirm against the producer of this metadata.
    name_keys = (
        "sender_name",
        "display_name",
        "contact_name",
        "profileName",
        "profile_name",
        "quoteAuthor",
    )

    candidates = self.conn.execute(
        f"""
        SELECT sender_id, metadata_json
        FROM {CONVERSATION_MESSAGES_TABLE}
        WHERE dataset_id = ?
          AND source_id = ?
          AND sender_id IS NOT NULL
          AND sender_id != ''
          AND metadata_json IS NOT NULL
        ORDER BY event_at DESC
        """,
        (dataset_id, source_id),
    ).fetchall()

    updated = 0
    seen: set[str] = set()
    for sender_id, metadata_json in candidates:
        sid = str(sender_id or "").strip()
        if not sid or sid in seen:
            continue
        # Only the first row per sender is considered.
        seen.add(sid)
        try:
            md = json.loads(metadata_json or "{}")
        except (TypeError, ValueError):
            continue
        if not isinstance(md, dict):
            continue

        display_name = None
        for key in name_keys:
            val = md.get(key)
            if isinstance(val, str) and val.strip():
                display_name = val.strip()
                break
        if not display_name:
            continue

        row = self.conn.execute(
            f"""
            SELECT contact_id
            FROM {CONTACT_IDENTIFIERS_TABLE}
            WHERE dataset_id = ?
              AND identifier = ?
              AND source_id IN (?, '*')
            ORDER BY CASE WHEN source_id = ? THEN 0 ELSE 1 END
            LIMIT 1
            """,
            (dataset_id, sid, source_id, source_id),
        ).fetchone()
        if not row or not row[0]:
            continue
        contact_id = str(row[0])

        # Restrict the UPDATE to rows that still lack a name. Without this
        # filter the statement matches already-named contacts too, which
        # inflates the returned count and bumps their updated_at for nothing.
        cursor = self.conn.execute(
            f"""
            UPDATE {CONTACTS_TABLE}
            SET display_name = ?,
                last_import_source = 'auto_resolve',
                updated_at = datetime('now')
            WHERE dataset_id = ? AND contact_id = ? AND display_name IS NULL
            """,
            (display_name, dataset_id, contact_id),
        )
        if int(cursor.rowcount or 0) > 0:
            updated += 1

    if updated:
        self.conn.commit()
    return updated
|
|
846
|
+
|
|
847
|
+
def import_contacts_batch(
    self,
    *,
    dataset_id: str,
    contacts: List[Dict[str, Any]],
    source_id: Optional[str] = None,
    target_sources: Optional[List[str]] = None,
    import_source: Optional[str] = None,
    import_run_id: Optional[str] = None,
) -> Dict[str, int]:
    """Upsert external contacts into the canonical contacts tables.

    Existing non-null display names are never overwritten. Each record is
    attached to the contact already mapped to one of its identifiers, if any;
    otherwise a deterministic contact id is derived from its identifier set.
    Every identifier is stored under the wildcard scope ``'*'`` and under each
    of the given sources.

    Args:
        dataset_id: Dataset the contacts belong to.
        contacts: Records with an ``identifiers`` entry (a list of strings or
            of ``{"identifier": ..., "type": ...}`` mappings; a single bare
            string or mapping is also accepted) and an optional
            ``display_name``. Records without a usable identifier are skipped.
        source_id: Optional source scope for the identifiers.
        target_sources: Optional additional source scopes.
        import_source: Optional label recorded as ``last_import_source``.
        import_run_id: Optional run id recorded as ``last_import_run_id``.

    Returns:
        ``{"contacts_upserted": n, "identifiers_upserted": m}`` where ``m``
        counts one per (identifier, scope) statement executed.
    """
    if not self.conn or not contacts:
        return {"contacts_upserted": 0, "identifiers_upserted": 0}
    self.ensure_tables()
    import_source_value = str(import_source or "").strip() or None
    import_run_id_value = str(import_run_id or "").strip() or None
    scoped_sources = sorted(
        {
            name
            for name in (str(raw or "").strip() for raw in [source_id, *(target_sources or [])])
            if name
        }
    )
    # Loop-invariant: the wildcard scope plus every explicit source scope.
    identifier_scopes = ["*"] + scoped_sources

    def _normalize_identifier_type(identifier: str, explicit_type: Optional[str]) -> str:
        """Return the explicit type if given, else guess one from the identifier's shape."""
        if explicit_type and str(explicit_type).strip():
            return str(explicit_type).strip().lower()
        low = (identifier or "").lower()
        if low.startswith("+") or low.replace("-", "").replace(" ", "").isdigit():
            return "phone"
        if "@" in low:
            return "email"
        if ":" in low or len(low) > 24:
            return "service_id"
        return "handle"

    def _merge_known_usernames_json(existing_json: Optional[str], candidates: List[str]) -> Optional[str]:
        """Merge stored and new usernames, de-duplicated case-insensitively, as JSON (or None)."""
        try:
            existing = json.loads(existing_json or "[]")
        except (TypeError, ValueError):
            existing = []
        if not isinstance(existing, list):
            existing = []
        merged: List[str] = []
        seen: set[str] = set()
        for raw in [*existing, *candidates]:
            val = str(raw or "").strip()
            if not val or len(val) > 128:
                continue
            low = val.lower()
            if low in seen:
                continue
            seen.add(low)
            merged.append(val)
        return json.dumps(merged, ensure_ascii=False) if merged else None

    contacts_upserted = 0
    identifiers_upserted = 0
    for rec in contacts:
        identifiers_raw = rec.get("identifiers") or []
        if isinstance(identifiers_raw, (str, dict)):
            # A lone identifier must be treated as one item. Iterating a string
            # would yield one bogus identifier per character, and iterating a
            # mapping would yield its keys.
            identifiers_raw = [identifiers_raw]

        pairs: List[tuple[str, str]] = []
        for item in identifiers_raw:
            if isinstance(item, dict):
                identifier = str(item.get("identifier") or "").strip()
                itype = _normalize_identifier_type(identifier, item.get("type"))
            else:
                identifier = str(item or "").strip()
                itype = _normalize_identifier_type(identifier, None)
            if identifier:
                pairs.append((identifier, itype))
        if not pairs:
            continue

        # Prefer existing contact mapping for any known identifier.
        contact_id = None
        for identifier, _ in pairs:
            row = self.conn.execute(
                f"""
                SELECT contact_id FROM {CONTACT_IDENTIFIERS_TABLE}
                WHERE dataset_id = ? AND identifier = ?
                LIMIT 1
                """,
                (dataset_id, identifier),
            ).fetchone()
            if row and row[0]:
                contact_id = str(row[0])
                break
        if not contact_id:
            # Deterministic id: the same identifier set always maps to the same contact.
            key = "|".join(sorted({f"{t}:{i}" for i, t in pairs}))
            digest = hashlib.sha1(key.encode("utf-8")).hexdigest()[:20]
            contact_id = f"{dataset_id}:contact:import:{digest}"

        display_name = rec.get("display_name")
        if display_name is not None:
            display_name = str(display_name).strip() or None

        username_candidates = [
            i for i, t in pairs
            if t in {"handle", "service_id"} and "@" not in i and len(i) <= 128
        ]
        existing_profile = self.conn.execute(
            f"""
            SELECT known_usernames_json
            FROM {CONTACTS_TABLE}
            WHERE contact_id = ?
            LIMIT 1
            """,
            (contact_id,),
        ).fetchone()
        usernames_json = _merge_known_usernames_json(
            existing_profile[0] if existing_profile else None,
            username_candidates,
        )

        # import_source_value is bound twice: once for the column itself and
        # once for the CASE expression that decides last_imported_at.
        self.conn.execute(
            f"""
            INSERT INTO {CONTACTS_TABLE}
            (
                contact_id,
                dataset_id,
                source_id,
                display_name,
                known_usernames_json,
                is_self,
                last_import_source,
                last_import_run_id,
                last_imported_at,
                created_at,
                updated_at
            )
            VALUES (?, ?, 'global', ?, ?, 0, ?, ?, CASE WHEN ? IS NOT NULL THEN datetime('now') ELSE NULL END, datetime('now'), datetime('now'))
            ON CONFLICT(contact_id) DO UPDATE SET
                display_name = COALESCE({CONTACTS_TABLE}.display_name, excluded.display_name),
                known_usernames_json = COALESCE(excluded.known_usernames_json, {CONTACTS_TABLE}.known_usernames_json),
                last_import_source = COALESCE(excluded.last_import_source, {CONTACTS_TABLE}.last_import_source),
                last_import_run_id = COALESCE(excluded.last_import_run_id, {CONTACTS_TABLE}.last_import_run_id),
                last_imported_at = CASE
                    WHEN excluded.last_import_source IS NOT NULL THEN datetime('now')
                    ELSE {CONTACTS_TABLE}.last_imported_at
                END,
                updated_at = datetime('now')
            """,
            (
                contact_id,
                dataset_id,
                display_name,
                usernames_json,
                import_source_value,
                import_run_id_value,
                import_source_value,
            ),
        )
        contacts_upserted += 1

        for identifier, identifier_type in pairs:
            for scope_source_id in identifier_scopes:
                # On conflict the existing contact mapping is kept as-is; only a
                # missing identifier_type and updated_at are refreshed.
                self.conn.execute(
                    f"""
                    INSERT INTO {CONTACT_IDENTIFIERS_TABLE}
                    (dataset_id, source_id, identifier, identifier_type, contact_id, created_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))
                    ON CONFLICT(dataset_id, source_id, identifier) DO UPDATE SET
                        identifier_type = COALESCE({CONTACT_IDENTIFIERS_TABLE}.identifier_type, excluded.identifier_type),
                        contact_id = {CONTACT_IDENTIFIERS_TABLE}.contact_id,
                        updated_at = datetime('now')
                    """,
                    (dataset_id, scope_source_id, identifier, identifier_type, contact_id),
                )
                identifiers_upserted += 1

    self.conn.commit()
    return {"contacts_upserted": contacts_upserted, "identifiers_upserted": identifiers_upserted}
|