topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1020 @@
1
+ """Canonical tables for human-to-human conversations (messenger ingestion).
2
+
3
+ Stores thread metadata in `conversations` and message rows in `conversation_messages`.
4
+ Used when canonical_group_id="conversations" (e.g. iMessage, Signal). Not ai_chat_*.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import logging
11
+ import hashlib
12
+ from datetime import datetime, timezone
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ logger = logging.getLogger("topos.storage.canonical.conversations_tables")
16
+
17
+ CONVERSATIONS_TABLE = "conversations"
18
+ CONVERSATION_MESSAGES_TABLE = "conversation_messages"
19
+ CONTACTS_TABLE = "contacts"
20
+ CONTACT_IDENTIFIERS_TABLE = "contact_identifiers"
21
+ CONVERSATION_PARTICIPANTS_TABLE = "conversation_participants"
22
+
23
+
24
+ def _message_timestamp_unix_for_sort(event_at: Optional[str], created_at: Optional[str]) -> float:
25
+ """
26
+ Parse message time for chronological ordering (larger = more recent).
27
+
28
+ SQLite ``ORDER BY event_at DESC`` on TEXT can mis-order values (mixed ISO shapes,
29
+ epoch strings, empty event_at with valid created_at). Callers fetch a bounded set
30
+ and re-sort in Python using this key.
31
+ """
32
+ for raw in (event_at, created_at):
33
+ if raw is None:
34
+ continue
35
+ s = str(raw).strip()
36
+ if not s:
37
+ continue
38
+ try:
39
+ iso = s
40
+ if iso.endswith("Z"):
41
+ iso = iso[:-1] + "+00:00"
42
+ dt = datetime.fromisoformat(iso)
43
+ if dt.tzinfo is None:
44
+ dt = dt.replace(tzinfo=timezone.utc)
45
+ return dt.timestamp()
46
+ except ValueError:
47
+ pass
48
+ try:
49
+ norm = s[:19].replace("T", " ")
50
+ dt = datetime.strptime(norm, "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
51
+ return dt.timestamp()
52
+ except ValueError:
53
+ pass
54
+ if s.isdigit() and len(s) >= 10:
55
+ try:
56
+ val = int(s)
57
+ if val > 10_000_000_000:
58
+ val //= 1000
59
+ return float(val)
60
+ except ValueError:
61
+ pass
62
+ return 0.0
63
+
64
+
65
def ensure_conversations_table(conn) -> None:
    """Create the thread-metadata table and its lookup indexes if they are absent.

    Runs the table DDL followed by the ``dataset_id`` and ``source_id`` index
    DDL on ``conn``, then commits once.
    """
    table = CONVERSATIONS_TABLE
    ddl_statements = (
        f"""
        CREATE TABLE IF NOT EXISTS {table} (
            conversation_id TEXT NOT NULL,
            dataset_id TEXT NOT NULL,
            source_id TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (conversation_id, dataset_id)
        )
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{table}_dataset_id
        ON {table}(dataset_id)
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{table}_source_id
        ON {table}(source_id)
        """,
    )
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.commit()
86
+
87
+
88
def ensure_conversation_messages_table(conn) -> None:
    """Create the message-row table and its indexes if absent. Stage 9: event_at, is_from_self.

    Runs the table DDL and then one index DDL each for ``conversation_id``,
    ``dataset_id``, ``source_id`` and ``event_at`` on ``conn``; commits once.
    """
    table = CONVERSATION_MESSAGES_TABLE
    ddl_statements = (
        f"""
        CREATE TABLE IF NOT EXISTS {table} (
            message_id TEXT NOT NULL PRIMARY KEY,
            conversation_id TEXT NOT NULL,
            dataset_id TEXT NOT NULL,
            sender_type TEXT,
            sender_id TEXT,
            reply_to_message_id TEXT,
            message_type TEXT,
            event_type TEXT,
            content TEXT,
            event_at TEXT NOT NULL,
            source_id TEXT NOT NULL,
            metadata_json TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            is_from_self INTEGER DEFAULT 0,
            owner_user_id TEXT
        )
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{table}_conversation_id
        ON {table}(conversation_id)
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{table}_dataset_id
        ON {table}(dataset_id)
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{table}_source_id
        ON {table}(source_id)
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{table}_event_at
        ON {table}(event_at)
        """,
    )
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.commit()
126
+
127
+
128
def ensure_contacts_table(conn) -> None:
    """Create the canonical contacts table and its indexes if they are absent.

    Runs the table DDL, a composite ``(dataset_id, source_id)`` index and an
    ``is_self`` index on ``conn``, then commits once.
    """
    table = CONTACTS_TABLE
    ddl_statements = (
        f"""
        CREATE TABLE IF NOT EXISTS {table} (
            contact_id TEXT NOT NULL PRIMARY KEY,
            dataset_id TEXT NOT NULL,
            source_id TEXT NOT NULL,
            display_name TEXT,
            known_usernames_json TEXT,
            is_self INTEGER NOT NULL DEFAULT 0,
            last_import_source TEXT,
            last_import_run_id TEXT,
            last_imported_at TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now'))
        )
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{table}_dataset_source
        ON {table}(dataset_id, source_id)
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{table}_is_self
        ON {table}(is_self)
        """,
    )
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.commit()
154
+
155
+
156
def ensure_contact_identifiers_table(conn) -> None:
    """Create the identifier-to-contact mapping table (phone/email/service ids) if absent.

    Rows are keyed by ``(dataset_id, source_id, identifier)``; a secondary index
    on ``contact_id`` is created as well. Commits once after both statements.
    """
    table = CONTACT_IDENTIFIERS_TABLE
    ddl_statements = (
        f"""
        CREATE TABLE IF NOT EXISTS {table} (
            dataset_id TEXT NOT NULL,
            source_id TEXT NOT NULL,
            identifier TEXT NOT NULL,
            identifier_type TEXT NOT NULL,
            contact_id TEXT NOT NULL,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (dataset_id, source_id, identifier)
        )
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{table}_contact
        ON {table}(contact_id)
        """,
    )
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.commit()
175
+
176
+
177
def ensure_conversation_participants_table(conn) -> None:
    """Create the conversation <-> participant relationship table if absent.

    Rows are keyed by ``(conversation_id, dataset_id, source_id, contact_id)``;
    a ``(dataset_id, source_id)`` index is created too. Commits once.
    """
    table = CONVERSATION_PARTICIPANTS_TABLE
    ddl_statements = (
        f"""
        CREATE TABLE IF NOT EXISTS {table} (
            conversation_id TEXT NOT NULL,
            dataset_id TEXT NOT NULL,
            source_id TEXT NOT NULL,
            contact_id TEXT NOT NULL,
            role TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (conversation_id, dataset_id, source_id, contact_id)
        )
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{table}_dataset_source
        ON {table}(dataset_id, source_id)
        """,
    )
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.commit()
196
+
197
+
198
def _ensure_signal_identity_columns(conn) -> None:
    """Add is_from_self and owner_user_id for Signal identity. Stage 9: is_from_self (was from_self).

    Idempotent: each column is attempted with its own ``ALTER TABLE`` and
    commit. Any failure is rolled back; it is logged at debug level unless the
    error text says the column is a duplicate.
    """
    wanted_columns = {
        "is_from_self": "INTEGER DEFAULT 0",
        "owner_user_id": "TEXT",
    }
    for column_name, column_type in wanted_columns.items():
        try:
            conn.execute(
                f"ALTER TABLE {CONVERSATION_MESSAGES_TABLE} ADD COLUMN {column_name} {column_type}"
            )
            conn.commit()
        except Exception as exc:
            already_present = "duplicate column" in str(exc).lower()
            if not already_present:
                logger.debug("Signal identity column %s: %s", column_name, exc)
            conn.rollback()
208
+
209
+
210
def _ensure_reply_and_event_columns(conn) -> None:
    """Add unified reply/system columns for messenger sources.

    Idempotent: ``reply_to_message_id``, ``message_type`` and ``event_type``
    are each attempted with their own ``ALTER TABLE`` and commit. Any failure
    is rolled back; it is logged at debug level unless the error text says the
    column is a duplicate.
    """
    text_columns = ("reply_to_message_id", "message_type", "event_type")
    for column_name in text_columns:
        column_type = "TEXT"
        try:
            conn.execute(
                f"ALTER TABLE {CONVERSATION_MESSAGES_TABLE} ADD COLUMN {column_name} {column_type}"
            )
            conn.commit()
        except Exception as exc:
            already_present = "duplicate column" in str(exc).lower()
            if not already_present:
                logger.debug("Messenger context column %s: %s", column_name, exc)
            conn.rollback()
224
+
225
+
226
def _ensure_contact_provenance_columns(conn) -> None:
    """Add contact import/profile columns to the contacts table.

    Idempotent: each column is attempted with its own ``ALTER TABLE`` and
    commit. Any failure is rolled back; it is logged at debug level unless the
    error text says the column is a duplicate.
    """
    text_columns = (
        "known_usernames_json",
        "last_import_source",
        "last_import_run_id",
        "last_imported_at",
    )
    for column_name in text_columns:
        column_type = "TEXT"
        try:
            conn.execute(
                f"ALTER TABLE {CONTACTS_TABLE} ADD COLUMN {column_name} {column_type}"
            )
            conn.commit()
        except Exception as exc:
            already_present = "duplicate column" in str(exc).lower()
            if not already_present:
                logger.debug("Contact provenance column %s: %s", column_name, exc)
            conn.rollback()
241
+
242
+
243
def _ensure_contact_sharing_policy_column(conn) -> None:
    """Stage 11: add the per-contact JSON policy column (name_visibility / row_visibility).

    Idempotent: the ``ALTER TABLE`` is committed on success. Any failure is
    rolled back; it is logged at debug level unless the error text says the
    column is a duplicate.
    """
    try:
        conn.execute(
            f"ALTER TABLE {CONTACTS_TABLE} ADD COLUMN sharing_policy_json TEXT"
        )
        conn.commit()
    except Exception as exc:
        already_present = "duplicate column" in str(exc).lower()
        if not already_present:
            logger.debug("Contact sharing_policy_json column: %s", exc)
        conn.rollback()
252
+
253
+
254
def ensure_all_tables(conn) -> None:
    """Ensure every conversations-group table exists and carries its later-added columns.

    Creates the conversations, conversation_messages, contacts,
    contact_identifiers and conversation_participants tables, then applies the
    additive column upgrades, in that order.

    Stage 9 column renames run at engine startup (app.py) to avoid blocking the event loop during requests.
    """
    setup_steps = (
        # Tables first, so the ALTER TABLE upgrades below have a target.
        ensure_conversations_table,
        ensure_conversation_messages_table,
        ensure_contacts_table,
        ensure_contact_identifiers_table,
        ensure_conversation_participants_table,
        # Additive column upgrades.
        _ensure_reply_and_event_columns,
        _ensure_signal_identity_columns,
        _ensure_contact_provenance_columns,
        _ensure_contact_sharing_policy_column,
    )
    for step in setup_steps:
        step(conn)
267
+
268
+
269
+ class ConversationsTablesManager:
270
+ """Writes to conversations and conversation_messages only (canonical_group_id=conversations)."""
271
+
272
+ def __init__(self, conn) -> None:
273
+ self.conn = conn
274
+
275
def ensure_tables(self) -> None:
    """Create the conversations-group tables on ``self.conn``; do nothing without a connection."""
    if not self.conn:
        return
    ensure_all_tables(self.conn)
279
+
280
def upsert_conversation(
    self,
    conversation_id: str,
    dataset_id: str,
    source_id: Optional[str] = None,
) -> None:
    """Insert one row in conversations, or refresh it when it already exists.

    Args:
        conversation_id: Thread identifier; part of the primary key.
        dataset_id: Owning dataset; part of the primary key.
        source_id: Originating source. ``None`` is stored as an empty string.

    The write is committed immediately. Does nothing when the manager has no
    connection.
    """
    if not self.conn:
        return
    self.ensure_tables()
    # A true upsert instead of INSERT OR REPLACE: REPLACE deletes and re-inserts
    # the row, which reset created_at to "now" on every call. On conflict only
    # source_id and updated_at are refreshed, so created_at keeps the time the
    # conversation was first recorded.
    self.conn.execute(
        f"""
        INSERT INTO {CONVERSATIONS_TABLE}
            (conversation_id, dataset_id, source_id, created_at, updated_at)
        VALUES (?, ?, ?, datetime('now'), datetime('now'))
        ON CONFLICT(conversation_id, dataset_id) DO UPDATE SET
            source_id = excluded.source_id,
            updated_at = datetime('now')
        """,
        (conversation_id, dataset_id, source_id or ""),
    )
    self.conn.commit()
296
+
297
def upsert_message_batch(
    self,
    records: List[Dict[str, Any]],
    dataset_id: str,
    source_id: str,
) -> Dict[str, int]:
    """
    Upsert messages into conversation_messages and ensure parent rows in conversations.

    Each record must have: message_id, thread_id or conversation_id, ts, sender_type, content.
    Optional: sender_id, _metadata, from_self (0/1), owner_user_id (for Signal identity).

    Every record with a non-empty ``sender_id`` also upserts a contact, its
    identifier mapping and a conversation-participant row. Records without a
    ``message_id`` still contribute to those rows but are not written as messages.

    Args:
        records: Parsed message dicts as described above.
        dataset_id: Dataset the rows belong to; also the fallback conversation id
            when a record names neither ``conversation_id`` nor ``thread_id``.
        source_id: Source the batch was ingested from.

    Returns:
        ``{"messages_created": n, "conversations_created": m}`` where ``n`` is the
        number of rows written to conversation_messages and ``m`` the number of
        distinct conversations upserted for this batch. Both are 0 when there is
        no connection or ``records`` is empty.
    """
    if not self.conn or not records:
        return {"messages_created": 0, "conversations_created": 0}
    self.ensure_tables()

    def _is_from_self(rec: Dict[str, Any]) -> bool:
        # One normalised reading of the self flag, shared by the contact row, the
        # participant role and the message row. The documented contract is 0/1,
        # so an integer 1 must count, not only the bool True.
        for key in ("is_from_self", "from_self"):
            flag = rec.get(key)
            if flag is True:
                return True
            if isinstance(flag, (int, float)) and flag == 1:
                return True
            if isinstance(flag, str) and flag.strip().lower() in {"1", "true"}:
                return True
        return False

    def _conversation_id_for(rec: Dict[str, Any]) -> str:
        return str(rec.get("conversation_id") or rec.get("thread_id") or dataset_id)

    def _normalize_identifier_type(identifier: str) -> str:
        low = (identifier or "").lower()
        if low.startswith("+") or low.replace("-", "").replace(" ", "").isdigit():
            return "phone"
        if "@" in low:
            return "email"
        if ":" in low or len(low) > 24:
            return "service_id"
        return "handle"

    def _merge_known_usernames_json(
        existing_json: Optional[str],
        candidates: List[str],
    ) -> Optional[str]:
        # Union of stored and newly seen usernames: case-insensitive de-duplication,
        # first spelling wins, blanks and values longer than 128 chars are dropped.
        try:
            existing = json.loads(existing_json or "[]")
        except (TypeError, ValueError):
            existing = []
        if not isinstance(existing, list):
            existing = []
        norm: List[str] = []
        seen = set()
        for raw in list(existing) + list(candidates):
            val = str(raw or "").strip()
            if not val or len(val) > 128:
                continue
            low = val.lower()
            if low in seen:
                continue
            seen.add(low)
            norm.append(val)
        return json.dumps(norm, ensure_ascii=False) if norm else None

    def _extract_display_name(rec: Dict[str, Any]) -> Optional[str]:
        metadata = rec.get("_metadata")
        if isinstance(metadata, dict):
            for key in (
                "sender_name",
                "display_name",
                "contact_name",
                "profileName",
                "profile_name",
                "quoteAuthor",
            ):
                val = metadata.get(key)
                if isinstance(val, str) and val.strip():
                    return val.strip()
        return None

    def _extract_username_candidates(rec: Dict[str, Any], sender_identifier_type: str) -> List[str]:
        out: List[str] = []
        sender_id = str(rec.get("sender_id") or "").strip()
        if sender_id and sender_identifier_type in {"handle", "service_id"} and "@" not in sender_id:
            out.append(sender_id)
        metadata = rec.get("_metadata")
        if isinstance(metadata, dict):
            for key in ("username", "handle", "profileName", "profile_name"):
                val = metadata.get(key)
                if isinstance(val, str) and val.strip():
                    out.append(val.strip())
        return out

    def _upsert_contact(rec: Dict[str, Any]) -> Optional[str]:
        sender_id = str(rec.get("sender_id") or "").strip()
        if not sender_id:
            return None
        sender_identifier_type = _normalize_identifier_type(sender_id)
        is_self = 1 if (_is_from_self(rec) or sender_id.lower() == "self") else 0
        # Prefer a mapping recorded for this exact source over a wildcard ('*') one.
        row = self.conn.execute(
            f"""
            SELECT contact_id
            FROM {CONTACT_IDENTIFIERS_TABLE}
            WHERE dataset_id = ?
              AND identifier = ?
              AND source_id IN (?, '*')
            ORDER BY CASE WHEN source_id = ? THEN 0 ELSE 1 END
            LIMIT 1
            """,
            (dataset_id, sender_id, source_id, source_id),
        ).fetchone()
        if row and row[0]:
            contact_id = str(row[0])
        else:
            # No mapping yet: derive a stable id from the identifier itself so
            # re-ingesting the same sender always resolves to the same contact.
            key = f"{sender_identifier_type}:{sender_id}"
            digest = hashlib.sha1(key.encode("utf-8")).hexdigest()[:20]
            contact_id = f"{dataset_id}:contact:{digest}"
        display_name = _extract_display_name(rec)
        existing_row = self.conn.execute(
            f"SELECT known_usernames_json FROM {CONTACTS_TABLE} WHERE contact_id = ? LIMIT 1",
            (contact_id,),
        ).fetchone()
        usernames_json = _merge_known_usernames_json(
            existing_row[0] if existing_row else None,
            _extract_username_candidates(rec, sender_identifier_type),
        )
        self.conn.execute(
            f"""
            INSERT INTO {CONTACTS_TABLE}
                (contact_id, dataset_id, source_id, display_name, known_usernames_json, is_self, last_import_source, last_import_run_id, last_imported_at, created_at, updated_at)
            VALUES (?, ?, 'global', ?, ?, ?, NULL, NULL, NULL, datetime('now'), datetime('now'))
            ON CONFLICT(contact_id) DO UPDATE SET
                display_name = COALESCE(excluded.display_name, {CONTACTS_TABLE}.display_name),
                known_usernames_json = COALESCE(excluded.known_usernames_json, {CONTACTS_TABLE}.known_usernames_json),
                is_self = CASE WHEN excluded.is_self = 1 THEN 1 ELSE {CONTACTS_TABLE}.is_self END,
                updated_at = datetime('now')
            """,
            (contact_id, dataset_id, display_name, usernames_json, is_self),
        )
        self.conn.execute(
            f"""
            INSERT INTO {CONTACT_IDENTIFIERS_TABLE}
                (dataset_id, source_id, identifier, identifier_type, contact_id, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))
            ON CONFLICT(dataset_id, source_id, identifier) DO UPDATE SET
                contact_id = excluded.contact_id,
                updated_at = datetime('now')
            """,
            (
                dataset_id,
                source_id,
                sender_id,
                sender_identifier_type,
                contact_id,
            ),
        )
        return contact_id

    # Pass 1: parent conversation rows, contacts and participants.
    seen_conversation_ids = set()
    participants_seen = set()
    for rec in records:
        conversation_id = _conversation_id_for(rec)
        if conversation_id not in seen_conversation_ids:
            self.upsert_conversation(conversation_id, dataset_id, source_id)
            seen_conversation_ids.add(conversation_id)
        contact_id = _upsert_contact(rec)
        if not contact_id:
            continue
        part_key = (conversation_id, contact_id)
        if part_key in participants_seen:
            continue
        self.conn.execute(
            f"""
            INSERT INTO {CONVERSATION_PARTICIPANTS_TABLE}
                (conversation_id, dataset_id, source_id, contact_id, role, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))
            ON CONFLICT(conversation_id, dataset_id, source_id, contact_id) DO UPDATE SET
                role = COALESCE(excluded.role, {CONVERSATION_PARTICIPANTS_TABLE}.role),
                updated_at = datetime('now')
            """,
            (
                conversation_id,
                dataset_id,
                source_id,
                contact_id,
                "self" if _is_from_self(rec) else "participant",
            ),
        )
        participants_seen.add(part_key)

    # Pass 2: the message rows themselves.
    messages_written = 0
    for rec in records:
        message_id = str(rec.get("message_id") or "")
        if not message_id:
            # Without a primary key there is nothing to upsert.
            continue
        metadata_json = None
        if "_metadata" in rec:
            metadata_json = json.dumps(rec["_metadata"], ensure_ascii=False)
        self.conn.execute(
            f"""
            INSERT OR REPLACE INTO {CONVERSATION_MESSAGES_TABLE}
                (message_id, conversation_id, dataset_id, sender_type, sender_id, reply_to_message_id, message_type, event_type, content, event_at, source_id, metadata_json, is_from_self, owner_user_id, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'))
            """,
            (
                message_id,
                _conversation_id_for(rec),
                dataset_id,
                rec.get("sender_type"),
                rec.get("sender_id"),
                rec.get("reply_to_message_id"),
                rec.get("message_type"),
                rec.get("event_type"),
                rec.get("content"),
                # event_at is NOT NULL in the schema, hence the empty-string fallback.
                rec.get("event_at") or rec.get("ts") or "",
                source_id,
                metadata_json,
                1 if _is_from_self(rec) else 0,
                rec.get("owner_user_id"),
            ),
        )
        messages_written += 1
    self.conn.commit()
    logger.debug(
        "[PIPELINE:CONVERSATIONS] Wrote %d messages to %s, %d conversation rows",
        messages_written,
        CONVERSATION_MESSAGES_TABLE,
        len(seen_conversation_ids),
    )
    # Report rows actually written, not len(records): records lacking a
    # message_id are skipped above.
    return {
        "messages_created": messages_written,
        "conversations_created": len(seen_conversation_ids),
    }
521
+
522
def list_contacts(
    self,
    *,
    dataset_id: str,
    source_id: str,
    limit: int = 200,
) -> List[Dict[str, Any]]:
    """List contacts with primary identifier and message counts for a source.

    Contacts of ``dataset_id`` are joined to their identifiers for
    ``source_id`` (or the ``'*'`` wildcard source) and to the messages sent
    under that identifier from ``source_id``. Rows are grouped per contact and
    identifier, ordered self-first and then by message count, and capped at
    ``limit``.

    Returns:
        One dict per row with keys ``contact_id``, ``display_name``,
        ``known_usernames``, ``sharing_policy``, ``is_self``,
        ``last_import_source``, ``last_import_run_id``, ``last_imported_at``,
        ``identifier``, ``identifier_type`` and ``message_count``. An empty
        list when the manager has no connection.
    """
    if not self.conn:
        return []
    self.ensure_tables()

    def _json_cell(raw: Any, expected_type: type) -> Any:
        # Empty cells, undecodable text and JSON of the wrong container type
        # all collapse to an empty instance of the expected container.
        if not raw:
            return expected_type()
        try:
            decoded = json.loads(raw)
        except Exception:
            return expected_type()
        return decoded if isinstance(decoded, expected_type) else expected_type()

    query = f"""
        SELECT c.contact_id,
               c.display_name,
               c.known_usernames_json,
               c.sharing_policy_json,
               c.is_self,
               c.last_import_source,
               c.last_import_run_id,
               c.last_imported_at,
               ci.identifier AS primary_identifier,
               ci.identifier_type AS primary_identifier_type,
               COUNT(m.message_id) AS message_count
        FROM {CONTACTS_TABLE} c
        LEFT JOIN {CONTACT_IDENTIFIERS_TABLE} ci
          ON ci.contact_id = c.contact_id
         AND ci.dataset_id = c.dataset_id
         AND ci.source_id IN (?, '*')
        LEFT JOIN {CONVERSATION_MESSAGES_TABLE} m
          ON m.dataset_id = c.dataset_id
         AND m.source_id = ?
         AND m.sender_id = ci.identifier
        WHERE c.dataset_id = ?
        GROUP BY c.contact_id, c.display_name, c.known_usernames_json, c.sharing_policy_json, c.is_self, c.last_import_source, c.last_import_run_id, c.last_imported_at, ci.identifier, ci.identifier_type
        ORDER BY c.is_self DESC, message_count DESC, c.updated_at DESC
        LIMIT ?
    """
    params = (source_id, source_id, dataset_id, int(limit))
    fetched = self.conn.execute(query, params).fetchall()
    return [
        {
            "contact_id": record[0],
            "display_name": record[1],
            "known_usernames": _json_cell(record[2], list),
            "sharing_policy": _json_cell(record[3], dict),
            "is_self": bool(record[4]),
            "last_import_source": record[5],
            "last_import_run_id": record[6],
            "last_imported_at": record[7],
            "identifier": record[8],
            "identifier_type": record[9],
            "message_count": int(record[10] or 0),
        }
        for record in fetched
    ]
593
+
594
def get_contact_message_samples(
    self,
    *,
    dataset_id: str,
    source_id: str,
    identifier: str,
    limit: int = 5,
) -> List[Dict[str, Any]]:
    """Return up to ``limit`` of the newest messages sent by ``identifier``.

    The query fetches a bounded candidate set ordered by the stored timestamp
    columns; the candidates are then re-ranked in Python with
    ``_message_timestamp_unix_for_sort`` and truncated to ``limit`` (at least 1).
    Returns an empty list when there is no connection or no identifier.
    """
    if not (self.conn and identifier):
        return []

    wanted = max(1, int(limit))
    # Over-fetch: ordering the stored values in SQL can disagree with true time
    # order, so a wider candidate window is re-sorted below.
    candidate_cap = min(500, max(60, wanted * 25))

    query = f"""
        SELECT message_id, content, event_at, conversation_id, created_at
        FROM {CONVERSATION_MESSAGES_TABLE}
        WHERE dataset_id = ?
          AND source_id = ?
          AND sender_id = ?
        ORDER BY event_at DESC, created_at DESC, message_id DESC
        LIMIT ?
        """
    candidates = self.conn.execute(
        query, (dataset_id, source_id, identifier, candidate_cap)
    ).fetchall()

    def _recency_key(record):
        """Sort key: parsed timestamp first, message id as the tie-breaker."""
        event_at = record[2]
        created_at = record[4]
        stamp = _message_timestamp_unix_for_sort(
            None if event_at is None else str(event_at),
            None if created_at is None else str(created_at),
        )
        return (stamp, str(record[0] or ""))

    newest_first = sorted(candidates, key=_recency_key, reverse=True)

    samples: List[Dict[str, Any]] = []
    for record in newest_first[:wanted]:
        samples.append(
            {
                "message_id": record[0],
                "content": record[1],
                "event_at": record[2],
                "conversation_id": record[3],
            }
        )
    return samples
640
+
641
def get_contact_conversation_thread_previews(
    self,
    *,
    dataset_id: str,
    source_id: str,
    profile_identifier: str,
    max_conversations: int = 8,
    messages_per_conversation: int = 45,
) -> List[Dict[str, Any]]:
    """Preview the latest part of each thread that ``profile_identifier`` wrote in.

    Conversations are ranked by the newest message the profile contact sent and
    the top ``max_conversations`` are kept. For each of them, the
    ``messages_per_conversation`` most recent messages from *all* participants
    are returned, ordered newest to oldest.
    """
    if not self.conn or not profile_identifier:
        return []
    self.ensure_tables()
    conversation_cap = max(1, int(max_conversations))
    window_size = max(1, int(messages_per_conversation))

    def _stamp(event_at, created_at):
        """Sortable timestamp for a message's (event_at, created_at) pair."""
        return _message_timestamp_unix_for_sort(
            None if event_at is None else str(event_at),
            None if created_at is None else str(created_at),
        )

    authored = self.conn.execute(
        f"""
        SELECT conversation_id, event_at, created_at
        FROM {CONVERSATION_MESSAGES_TABLE}
        WHERE dataset_id = ?
          AND source_id = ?
          AND sender_id = ?
        """,
        (dataset_id, source_id, profile_identifier),
    ).fetchall()

    # Latest activity by the profile contact, per conversation.
    latest_by_conversation: Dict[str, float] = {}
    for raw_conversation_id, event_at, created_at in authored:
        conversation_key = str(raw_conversation_id or "").strip()
        if conversation_key:
            previous = latest_by_conversation.get(conversation_key, 0.0)
            latest_by_conversation[conversation_key] = max(previous, _stamp(event_at, created_at))

    ranked = sorted(
        latest_by_conversation,
        key=latest_by_conversation.get,
        reverse=True,
    )

    previews: List[Dict[str, Any]] = []
    for conversation_key in ranked[:conversation_cap]:
        thread = self.conn.execute(
            f"""
            SELECT message_id, content, event_at, conversation_id, created_at, sender_id, is_from_self
            FROM {CONVERSATION_MESSAGES_TABLE}
            WHERE dataset_id = ?
              AND source_id = ?
              AND conversation_id = ?
            """,
            (dataset_id, source_id, conversation_key),
        ).fetchall()
        oldest_first = sorted(
            thread,
            key=lambda rec: (_stamp(rec[2], rec[4]), str(rec[0] or "")),
        )
        # window_size >= 1, so the negative slice yields the whole thread when
        # it is shorter than the window.
        recent = oldest_first[-window_size:]
        messages = [
            {
                "message_id": rec[0],
                "content": rec[1],
                "event_at": rec[2],
                "conversation_id": rec[3],
                "created_at": rec[4],
                "sender_id": rec[5],
                "is_from_self": False if rec[6] is None else bool(rec[6]),
            }
            for rec in reversed(recent)
        ]
        previews.append({"conversation_id": conversation_key, "messages": messages})
    return previews
723
+
724
def update_contact_display_name(
    self,
    *,
    dataset_id: str,
    source_id: str,
    contact_id: str,
    display_name: Optional[str],
) -> None:
    """Set or clear a contact's display name and mark it as a manual edit.

    A falsy ``display_name`` is stored as NULL. ``source_id`` is accepted but
    not used by the update; the row is addressed by ``dataset_id`` and
    ``contact_id`` only. Does nothing when there is no connection.
    """
    if not self.conn:
        return
    new_name = display_name or None
    statement = f"""
        UPDATE {CONTACTS_TABLE}
        SET display_name = ?, last_import_source = 'manual_edit', updated_at = datetime('now')
        WHERE dataset_id = ? AND contact_id = ?
        """
    self.conn.execute(statement, (new_name, dataset_id, contact_id))
    self.conn.commit()
744
+
745
def update_contact_sharing_policy(
    self,
    *,
    dataset_id: str,
    contact_id: str,
    sharing_policy: Optional[Dict[str, Any]],
) -> None:
    """Stage 11: store ``sharing_policy`` as JSON in ``sharing_policy_json``.

    An empty or missing policy is stored as NULL; anything else is serialised
    with ``json.dumps``. Does nothing when there is no connection.
    """
    if not self.conn:
        return
    self.ensure_tables()
    if sharing_policy:
        serialized = json.dumps(sharing_policy)
    else:
        serialized = None
    statement = f"""
        UPDATE {CONTACTS_TABLE}
        SET sharing_policy_json = ?, updated_at = datetime('now')
        WHERE dataset_id = ? AND contact_id = ?
        """
    self.conn.execute(statement, (serialized, dataset_id, contact_id))
    self.conn.commit()
766
+
767
def auto_resolve_contact_names(
    self,
    *,
    dataset_id: str,
    source_id: str,
) -> int:
    """Best-effort fill of missing contact display names from message metadata.

    Messages of the source are scanned newest first. For each sender, the first
    message whose metadata carries a non-empty name (under one of the known
    keys) supplies the candidate; messages without a usable name no longer stop
    older messages of the same sender from being considered. Existing display
    names are never overwritten.

    Args:
        dataset_id: Dataset to operate on.
        source_id: Source whose messages are scanned; identifier rows scoped to
            this source are preferred over ``'*'`` wildcard rows.

    Returns:
        Number of contacts whose display name was actually filled in
        (0 when there is no connection).
    """
    if not self.conn:
        return 0
    candidates = self.conn.execute(
        f"""
        SELECT sender_id, metadata_json
        FROM {CONVERSATION_MESSAGES_TABLE}
        WHERE dataset_id = ?
          AND source_id = ?
          AND sender_id IS NOT NULL
          AND sender_id != ''
          AND metadata_json IS NOT NULL
        ORDER BY event_at DESC
        """,
        (dataset_id, source_id),
    ).fetchall()

    # Metadata keys probed for a name, in priority order.
    name_keys = (
        "sender_name",
        "display_name",
        "contact_name",
        "profileName",
        "profile_name",
        "quoteAuthor",
    )

    updated = 0
    resolved: set[str] = set()  # senders for which a candidate name was found
    for sender_id, metadata_json in candidates:
        sid = str(sender_id or "").strip()
        if not sid or sid in resolved:
            continue
        try:
            md = json.loads(metadata_json or "{}")
        except (TypeError, ValueError):
            # Unparseable metadata: try the sender's next (older) message.
            continue
        if not isinstance(md, dict):
            continue
        display_name = None
        for key in name_keys:
            val = md.get(key)
            if isinstance(val, str) and val.strip():
                display_name = val.strip()
                break
        if not display_name:
            continue
        # Only now is the sender settled; the contact lookup below does not
        # depend on the message, so retrying with older messages is pointless.
        resolved.add(sid)

        row = self.conn.execute(
            f"""
            SELECT contact_id
            FROM {CONTACT_IDENTIFIERS_TABLE}
            WHERE dataset_id = ?
              AND identifier = ?
              AND source_id IN (?, '*')
            ORDER BY CASE WHEN source_id = ? THEN 0 ELSE 1 END
            LIMIT 1
            """,
            (dataset_id, sid, source_id, source_id),
        ).fetchone()
        if not row or not row[0]:
            continue
        contact_id = str(row[0])

        # Restricting the UPDATE to rows without a name keeps existing names
        # (and their updated_at) untouched and makes rowcount reflect real
        # changes only.
        cursor = self.conn.execute(
            f"""
            UPDATE {CONTACTS_TABLE}
            SET display_name = ?,
                last_import_source = 'auto_resolve',
                updated_at = datetime('now')
            WHERE dataset_id = ? AND contact_id = ? AND display_name IS NULL
            """,
            (display_name, dataset_id, contact_id),
        )
        if int(cursor.rowcount or 0) > 0:
            updated += 1
    if updated:
        self.conn.commit()
    return updated
846
+
847
def import_contacts_batch(
    self,
    *,
    dataset_id: str,
    contacts: List[Dict[str, Any]],
    source_id: Optional[str] = None,
    target_sources: Optional[List[str]] = None,
    import_source: Optional[str] = None,
    import_run_id: Optional[str] = None,
) -> Dict[str, int]:
    """Upsert externally supplied contacts and their identifiers.

    Each record contributes its ``identifiers`` (plain strings or dicts with
    ``identifier`` / ``type``) and an optional ``display_name``. A record is
    attached to the contact that already owns any of its identifiers; otherwise
    a contact id is derived from a hash of the identifier set. Identifiers are
    written under the ``'*'`` scope and under every non-empty source named by
    ``source_id`` / ``target_sources``. An existing non-null display name is
    never overwritten. Records without identifiers are skipped.

    Returns:
        ``{"contacts_upserted": ..., "identifiers_upserted": ...}`` counting the
        upsert statements issued (zeros when there is no connection or input).
    """
    if not self.conn or not contacts:
        return {"contacts_upserted": 0, "identifiers_upserted": 0}
    self.ensure_tables()
    import_source_value = str(import_source or "").strip() or None
    import_run_id_value = str(import_run_id or "").strip() or None

    requested_scopes = [source_id] + list(target_sources or [])
    cleaned_scopes = {str(raw or "").strip() for raw in requested_scopes}
    scoped_sources = sorted(name for name in cleaned_scopes if name)
    # Every identifier is stored for the wildcard scope plus each named source.
    identifier_scopes = ["*"] + scoped_sources

    def _normalize_identifier_type(identifier: str, explicit_type: Optional[str]) -> str:
        """Use the declared type when given, otherwise guess from the identifier's shape."""
        declared = str(explicit_type).strip() if explicit_type else ""
        if declared:
            return declared.lower()
        low = (identifier or "").lower()
        digits_candidate = low.replace("-", "").replace(" ", "")
        if low.startswith("+") or digits_candidate.isdigit():
            return "phone"
        if "@" in low:
            return "email"
        if ":" in low or len(low) > 24:
            return "service_id"
        return "handle"

    def _merge_known_usernames_json(existing_json: Optional[str], candidates: List[str]) -> Optional[str]:
        """Union stored and new usernames (case-insensitive, order-preserving) as JSON."""
        try:
            existing = json.loads(existing_json or "[]")
        except Exception:
            existing = []
        else:
            if not isinstance(existing, list):
                existing = []
        merged: List[str] = []
        seen_lower: set[str] = set()
        for raw in [*existing, *candidates]:
            val = str(raw or "").strip()
            if not val or len(val) > 128:
                continue
            folded = val.lower()
            if folded not in seen_lower:
                seen_lower.add(folded)
                merged.append(val)
        if not merged:
            return None
        return json.dumps(merged, ensure_ascii=False)

    lookup_contact_sql = f"""
        SELECT contact_id FROM {CONTACT_IDENTIFIERS_TABLE}
        WHERE dataset_id = ? AND identifier = ?
        LIMIT 1
        """
    existing_usernames_sql = f"""
        SELECT known_usernames_json
        FROM {CONTACTS_TABLE}
        WHERE contact_id = ?
        LIMIT 1
        """
    upsert_contact_sql = f"""
        INSERT INTO {CONTACTS_TABLE}
        (
            contact_id,
            dataset_id,
            source_id,
            display_name,
            known_usernames_json,
            is_self,
            last_import_source,
            last_import_run_id,
            last_imported_at,
            created_at,
            updated_at
        )
        VALUES (?, ?, 'global', ?, ?, 0, ?, ?, CASE WHEN ? IS NOT NULL THEN datetime('now') ELSE NULL END, datetime('now'), datetime('now'))
        ON CONFLICT(contact_id) DO UPDATE SET
            display_name = COALESCE({CONTACTS_TABLE}.display_name, excluded.display_name),
            known_usernames_json = COALESCE(excluded.known_usernames_json, {CONTACTS_TABLE}.known_usernames_json),
            last_import_source = COALESCE(excluded.last_import_source, {CONTACTS_TABLE}.last_import_source),
            last_import_run_id = COALESCE(excluded.last_import_run_id, {CONTACTS_TABLE}.last_import_run_id),
            last_imported_at = CASE
                WHEN excluded.last_import_source IS NOT NULL THEN datetime('now')
                ELSE {CONTACTS_TABLE}.last_imported_at
            END,
            updated_at = datetime('now')
        """
    upsert_identifier_sql = f"""
        INSERT INTO {CONTACT_IDENTIFIERS_TABLE}
        (dataset_id, source_id, identifier, identifier_type, contact_id, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))
        ON CONFLICT(dataset_id, source_id, identifier) DO UPDATE SET
            identifier_type = COALESCE({CONTACT_IDENTIFIERS_TABLE}.identifier_type, excluded.identifier_type),
            contact_id = {CONTACT_IDENTIFIERS_TABLE}.contact_id,
            updated_at = datetime('now')
        """

    contacts_upserted = 0
    identifiers_upserted = 0
    for rec in contacts:
        # Normalise the record's identifiers into (identifier, type) pairs.
        pairs: List[tuple[str, str]] = []
        for item in rec.get("identifiers") or []:
            if isinstance(item, dict):
                identifier = str(item.get("identifier") or "").strip()
                declared_type = item.get("type")
            else:
                identifier = str(item or "").strip()
                declared_type = None
            itype = _normalize_identifier_type(identifier, declared_type)
            if identifier:
                pairs.append((identifier, itype))
        if not pairs:
            continue

        # Reuse the contact that already owns one of these identifiers, if any.
        contact_id = None
        for identifier, _unused_type in pairs:
            owner = self.conn.execute(lookup_contact_sql, (dataset_id, identifier)).fetchone()
            if owner and owner[0]:
                contact_id = str(owner[0])
                break
        if not contact_id:
            # No known owner: derive a stable id from the sorted identifier set.
            fingerprint = "|".join(sorted({f"{t}:{i}" for i, t in pairs}))
            digest = hashlib.sha1(fingerprint.encode("utf-8")).hexdigest()[:20]
            contact_id = f"{dataset_id}:contact:import:{digest}"

        display_name = rec.get("display_name")
        if display_name is not None:
            display_name = str(display_name).strip() or None

        username_candidates = []
        for identifier, itype in pairs:
            if itype in {"handle", "service_id"} and "@" not in identifier and len(identifier) <= 128:
                username_candidates.append(identifier)
        existing_profile = self.conn.execute(existing_usernames_sql, (contact_id,)).fetchone()
        stored_usernames = existing_profile[0] if existing_profile else None
        usernames_json = _merge_known_usernames_json(stored_usernames, username_candidates)

        self.conn.execute(
            upsert_contact_sql,
            (
                contact_id,
                dataset_id,
                display_name,
                usernames_json,
                import_source_value,
                import_run_id_value,
                import_source_value,
            ),
        )
        contacts_upserted += 1

        for identifier, identifier_type in pairs:
            for scope_source_id in identifier_scopes:
                self.conn.execute(
                    upsert_identifier_sql,
                    (dataset_id, scope_source_id, identifier, identifier_type, contact_id),
                )
                identifiers_upserted += 1

    self.conn.commit()
    return {"contacts_upserted": contacts_upserted, "identifiers_upserted": identifiers_upserted}