topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,522 @@
1
+ """Messenger graph extraction from canonical conversation tables (Sprint 01).
2
+
3
+ Nodes are strictly chat participants from canonical membership/senders.
4
+ Edges combine:
5
+ - co-participation in conversations
6
+ - direct links from reply-to relationships
7
+ - direct links from @mentions in message content (only when mention resolves to a participant)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import re
14
+ from collections import Counter, defaultdict
15
+ from datetime import datetime
16
+ from itertools import combinations
17
+ from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
18
+
19
+
20
# Matches "@handle" tokens that are not preceded by a word character, so the
# "@" inside an e-mail address such as "a@b.com" is skipped. Group 1 captures
# the handle: 2-64 characters drawn from letters, digits and "_ . + -".
MENTION_PATTERN = re.compile(r"(?<!\w)@([A-Za-z0-9_.+\-]{2,64})")


# Per-source notes on where reply and mention links come from. The mapping is
# returned verbatim under the "investigation" key of the graph payload.
SOURCE_DIRECT_LINK_INVESTIGATION: Dict[str, Dict[str, Any]] = {
    "imessage": dict(
        reply_fields=[
            "conversation_messages.reply_to_message_id",
            "conversation_messages.metadata_json.thread_originator_guid",
            "conversation_messages.metadata_json.associated_message_guid",
        ],
        mention_fields=[
            "message content @token (regex extraction)",
        ],
        notes=(
            "iMessage ingestion maps thread-originator/associated context into "
            "reply_to_message_id and metadata_json. "
            "No structured mention field is currently ingested; "
            "mentions are extracted from content."
        ),
    ),
    "signal": dict(
        reply_fields=[
            "conversation_messages.reply_to_message_id",
            "conversation_messages.metadata_json.quoteId",
            "conversation_messages.metadata_json.quotedMessageId",
            "conversation_messages.metadata_json.replyToMessageId",
        ],
        mention_fields=[
            "message content @token (regex extraction)",
            "metadata_json may include quoteAuthor* fields (used as context only)",
        ],
        notes=(
            "Signal ingestion resolves quote/reply context into "
            "reply_to_message_id when possible. "
            "No canonical structured @mention list is currently stored."
        ),
    ),
    # Placeholder entry: nothing is extracted for this source yet.
    "whatsapp": dict(
        reply_fields=["not implemented yet"],
        mention_fields=["not implemented yet"],
        notes="Reserved for future source integration.",
    ),
}
65
+
66
+
67
+ def _parse_ts(value: str) -> Optional[datetime]:
68
+ if not value:
69
+ return None
70
+ text = str(value).strip()
71
+ if not text:
72
+ return None
73
+ try:
74
+ return datetime.fromisoformat(text.replace("Z", "+00:00"))
75
+ except ValueError:
76
+ return None
77
+
78
+
79
def _period_key(ts: str, granularity: str) -> str:
    """Return the bucket label that timestamp ``ts`` falls into.

    The label is "YYYY" for ``"year"``, "YYYY-Qn" for ``"quarter"`` and
    "YYYY-MM" for any other granularity. Timestamps that cannot be parsed
    are bucketed under "unknown".
    """
    moment = _parse_ts(ts)
    if moment is None:
        return "unknown"
    if granularity == "year":
        return f"{moment.year}"
    if granularity == "quarter":
        # Months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4.
        return f"{moment.year}-Q{(moment.month + 2) // 3}"
    return f"{moment.year:04d}-{moment.month:02d}"
89
+
90
+
91
def _extract_mentions(content: Optional[str]) -> Set[str]:
    """Collect the lower-cased @handles found in ``content``.

    Handles are whatever ``MENTION_PATTERN`` captures in group 1. Empty or
    missing content produces an empty set.
    """
    handles: Set[str] = set()
    if content:
        for match in MENTION_PATTERN.finditer(content):
            handles.add(match.group(1).lower())
    return handles
95
+
96
+
97
+ def _normalize_source_ids(source_ids: Optional[Sequence[str]]) -> Optional[List[str]]:
98
+ if not source_ids:
99
+ return None
100
+ norm = sorted({str(s).strip() for s in source_ids if str(s).strip()})
101
+ return norm or None
102
+
103
+
104
+ def _sql_in_clause(values: Sequence[str]) -> Tuple[str, List[str]]:
105
+ placeholders = ",".join(["?"] * len(values))
106
+ return f"({placeholders})", list(values)
107
+
108
+
109
+ def _rows_to_dicts(cursor_rows: Iterable[Any]) -> List[Dict[str, Any]]:
110
+ out: List[Dict[str, Any]] = []
111
+ for row in cursor_rows:
112
+ if isinstance(row, dict):
113
+ out.append(dict(row))
114
+ continue
115
+ if hasattr(row, "keys"):
116
+ out.append({k: row[k] for k in row.keys()})
117
+ continue
118
+ raise TypeError("Expected sqlite Row/dict rows; ensure connection.row_factory is set")
119
+ return out
120
+
121
+
122
def _load_contact_profiles(
    conn: Any,
    *,
    dataset_id: str,
) -> Dict[str, Dict[str, Any]]:
    """Load display names and known usernames for every contact in a dataset.

    Args:
        conn: Connection whose ``execute(...).fetchall()`` yields dict-like
            rows (see ``_rows_to_dicts``).
        dataset_id: Dataset whose ``contacts`` rows are read.

    Returns:
        Mapping of contact id (as ``str``) to
        ``{"display_name": ..., "known_usernames": [...]}``. Usernames are
        stripped and blank ones are dropped. A missing, malformed or
        non-list ``known_usernames_json`` yields an empty list.
    """
    rows = conn.execute(
        """
        SELECT contact_id, display_name, known_usernames_json
        FROM contacts
        WHERE dataset_id = ?
        """,
        (dataset_id,),
    ).fetchall()

    profiles: Dict[str, Dict[str, Any]] = {}
    for row in _rows_to_dicts(rows):
        known_usernames: List[str] = []
        raw_usernames = row.get("known_usernames_json")
        if isinstance(raw_usernames, str) and raw_usernames.strip():
            try:
                parsed = json.loads(raw_usernames)
            except (ValueError, RecursionError):
                # Best effort: malformed (or pathologically nested) JSON is
                # treated as "no usernames" rather than failing the load.
                parsed = None
            if isinstance(parsed, list):
                # Skip JSON nulls: str(None) would otherwise register the
                # literal username "None" as an alias for this contact.
                cleaned = (str(v).strip() for v in parsed if v is not None)
                known_usernames = [name for name in cleaned if name]
        profiles[str(row["contact_id"])] = {
            "display_name": row.get("display_name"),
            "known_usernames": known_usernames,
        }
    return profiles
151
+
152
+
153
def _load_contact_identifiers(
    conn: Any,
    *,
    dataset_id: str,
    source_ids: Optional[Sequence[str]],
) -> Dict[str, Set[str]]:
    """Group lower-cased identifiers by contact id for one dataset.

    With ``source_ids`` given, only identifiers registered for those sources
    or for the wildcard source ``'*'`` are read. Without it, every identifier
    of the dataset is read. Rows with a blank contact id or identifier are
    ignored.
    """
    if source_ids:
        in_clause, in_params = _sql_in_clause(source_ids)
        condition = (
            f"(dataset_id = ? AND source_id IN {in_clause})"
            " OR (dataset_id = ? AND source_id = '*')"
        )
        # dataset_id is bound twice: once for each OR branch.
        bindings: List[Any] = [dataset_id, *in_params, dataset_id]
    else:
        condition = "dataset_id = ?"
        bindings = [dataset_id]

    fetched = conn.execute(
        f"""
        SELECT contact_id, identifier
        FROM contact_identifiers
        WHERE {condition}
        """,
        tuple(bindings),
    ).fetchall()

    grouped: Dict[str, Set[str]] = defaultdict(set)
    for record in _rows_to_dicts(fetched):
        owner = str(record.get("contact_id") or "").strip()
        handle = str(record.get("identifier") or "").strip()
        if not (owner and handle):
            continue
        grouped[owner].add(handle.lower())
    return grouped
181
+
182
+
183
+ def _participant_aliases_for_contacts(
184
+ *,
185
+ contact_ids: Iterable[str],
186
+ contact_profiles: Dict[str, Dict[str, Any]],
187
+ contact_identifiers: Dict[str, Set[str]],
188
+ ) -> Dict[str, Set[str]]:
189
+ aliases_by_contact: Dict[str, Set[str]] = {}
190
+ for contact_id in contact_ids:
191
+ aliases: Set[str] = set()
192
+ profile = contact_profiles.get(contact_id) or {}
193
+ display_name = str(profile.get("display_name") or "").strip()
194
+ if display_name:
195
+ aliases.add(display_name.lower())
196
+ aliases.add(display_name.replace(" ", "").lower())
197
+ for username in profile.get("known_usernames", []) or []:
198
+ uname = str(username).strip().lower()
199
+ if uname:
200
+ aliases.add(uname)
201
+ for identifier in contact_identifiers.get(contact_id, set()):
202
+ aliases.add(identifier.lower())
203
+ aliases_by_contact[contact_id] = {a for a in aliases if a}
204
+ return aliases_by_contact
205
+
206
+
207
def _load_contact_id_lookup_by_identifier(
    conn: Any,
    *,
    dataset_id: str,
    source_ids: Optional[Sequence[str]],
) -> Dict[Tuple[str, str], str]:
    """Map ``(source_id, identifier)`` pairs to contact ids for one dataset.

    With ``source_ids`` given, rows are limited to those sources plus the
    wildcard source ``'*'``. Keys keep the identifier's original case. Rows
    with a blank source, identifier or contact id are skipped. If a key
    occurs more than once, the last row read wins.
    """
    conditions = ["dataset_id = ?"]
    bindings: List[Any] = [dataset_id]
    if source_ids:
        in_clause, in_params = _sql_in_clause(source_ids)
        conditions.append(f"(source_id IN {in_clause} OR source_id = '*')")
        bindings += in_params

    where_sql = " AND ".join(conditions)
    cursor = conn.execute(
        f"""
        SELECT source_id, identifier, contact_id
        FROM contact_identifiers
        WHERE {where_sql}
        """,
        tuple(bindings),
    )

    lookup: Dict[Tuple[str, str], str] = {}
    for record in _rows_to_dicts(cursor.fetchall()):
        src, identifier, contact_id = (
            str(record.get(column) or "").strip()
            for column in ("source_id", "identifier", "contact_id")
        )
        if src and identifier and contact_id:
            lookup[(src, identifier)] = contact_id
    return lookup
237
+
238
+
239
+ def _build_unique_alias_lookup(aliases_by_contact: Dict[str, Set[str]]) -> Dict[str, str]:
240
+ collisions: Dict[str, Set[str]] = defaultdict(set)
241
+ for contact_id, aliases in aliases_by_contact.items():
242
+ for alias in aliases:
243
+ collisions[alias].add(contact_id)
244
+ return {
245
+ alias: next(iter(contact_ids))
246
+ for alias, contact_ids in collisions.items()
247
+ if len(contact_ids) == 1
248
+ }
249
+
250
+
251
def extract_messenger_graph(
    *,
    dataset_id: str,
    conn: Optional[Any] = None,
    start_ts: Optional[str] = None,
    end_ts: Optional[str] = None,
    source_ids: Optional[Sequence[str]] = None,
    period_granularity: str = "month",
    cumulative: bool = False,
) -> Dict[str, Any]:
    """Extract messenger graph nodes/edges per period from canonical tables.

    Nodes are contacts: the stored participants of every conversation with
    a message in the period, plus any message sender that resolves to a
    contact. Undirected edges between two contacts are weighted by the sum
    of three counts:

    - ``co_participation``: conversations in the period that both belong to
    - ``direct_reply``: replies whose target message was sent by the other
      contact (the target must be among the fetched messages and its sender
      must be a member of the conversation)
    - ``direct_mention``: @mentions in message content that resolve, via an
      alias unique within the conversation, to the other contact

    Args:
        dataset_id: Dataset to read; required.
        conn: Connection to use. When ``None`` the connection returned by
            ``topos.core.state.get_db_connection`` is used. Rows must be
            dict-like (see ``_rows_to_dicts``).
        start_ts: Optional inclusive lower bound, compared against
            ``conversation_messages.event_at`` in SQL.
        end_ts: Optional inclusive upper bound, compared the same way.
        source_ids: Optional source filter; blank entries are ignored.
        period_granularity: ``"month"``, ``"quarter"`` or ``"year"``.
        cumulative: When true, each period's graph is built from all
            messages up to and including that period rather than from that
            period's messages alone.

    Returns:
        {
            "period_granularity": "month|quarter|year",
            "source_ids": [...],
            "periods": [
                {
                    "period_key": "YYYY-MM",
                    "nodes": [{"id", "label", "source_ids"}],
                    "edges": [{"source","target","weight","edge_type","edge_type_counts"}],
                },
            ],
            "investigation": SOURCE_DIRECT_LINK_INVESTIGATION,
        }

        Periods are sorted by key; messages with an unparseable timestamp
        fall into the ``"unknown"`` period. ``edge_type`` is the single
        contributing type, or ``"mixed"`` when several contribute.

    Raises:
        ValueError: if ``dataset_id`` is empty or ``period_granularity`` is
            not one of the supported values.
        RuntimeError: if no database connection is available.
    """
    if not dataset_id:
        raise ValueError("dataset_id is required")
    if period_granularity not in {"month", "quarter", "year"}:
        raise ValueError("period_granularity must be one of: month, quarter, year")

    if conn is not None:
        db = conn
    else:
        from ..core.state import get_db_connection

        db = get_db_connection()
    if db is None:
        raise RuntimeError("Database connection not available")

    normalized_sources = _normalize_source_ids(source_ids)

    # --- Fetch the messages in scope -------------------------------------
    query_params: List[Any] = [dataset_id]
    where = ["m.dataset_id = ?"]
    if start_ts:
        where.append("m.event_at >= ?")
        query_params.append(start_ts)
    if end_ts:
        where.append("m.event_at <= ?")
        query_params.append(end_ts)
    if normalized_sources:
        in_clause, in_params = _sql_in_clause(normalized_sources)
        where.append(f"m.source_id IN {in_clause}")
        query_params.extend(in_params)

    rows = db.execute(
        f"""
        SELECT
            m.message_id,
            m.conversation_id,
            m.source_id,
            m.sender_id,
            m.reply_to_message_id,
            m.content,
            m.event_at
        FROM conversation_messages m
        WHERE {" AND ".join(where)}
        ORDER BY m.event_at ASC, m.message_id ASC
        """,
        tuple(query_params),
    ).fetchall()
    message_rows = _rows_to_dicts(rows)
    if not message_rows:
        return {
            "period_granularity": period_granularity,
            "source_ids": normalized_sources or [],
            "periods": [],
            "investigation": SOURCE_DIRECT_LINK_INVESTIGATION,
        }

    contact_lookup = _load_contact_id_lookup_by_identifier(
        db,
        dataset_id=dataset_id,
        source_ids=normalized_sources,
    )

    # --- Resolve senders and bucket messages by period --------------------
    # Each entry pairs a message with its (conversation_id, source_id) key so
    # the key is computed once instead of once per later pass.
    messages_by_period: Dict[str, List[Tuple[Tuple[str, str], Dict[str, Any]]]] = defaultdict(list)
    conversation_keys_by_period: Dict[str, Set[Tuple[str, str]]] = defaultdict(set)
    # message_id -> sender contact, across all periods, so a reply can be
    # attributed even when its target message lies in another period.
    global_message_sender: Dict[str, str] = {}

    for row in message_rows:
        sender_id = str(row.get("sender_id") or "").strip()
        source_id = str(row.get("source_id") or "").strip()
        sender_contact_id = ""
        if sender_id and source_id:
            # Prefer a source-specific identifier, then the wildcard source.
            sender_contact_id = (
                contact_lookup.get((source_id, sender_id), "")
                or contact_lookup.get(("*", sender_id), "")
            )
        row["sender_contact_id"] = sender_contact_id

        period_key = _period_key(str(row.get("event_at") or ""), period_granularity)
        conv_key = (str(row.get("conversation_id") or ""), str(row.get("source_id") or ""))
        messages_by_period[period_key].append((conv_key, row))
        conversation_keys_by_period[period_key].add(conv_key)

        message_id = str(row.get("message_id") or "")
        if message_id and sender_contact_id:
            global_message_sender[message_id] = sender_contact_id

    # --- Load stored conversation membership ------------------------------
    cp_params: List[Any] = [dataset_id]
    cp_where = ["dataset_id = ?"]
    if normalized_sources:
        in_clause, in_params = _sql_in_clause(normalized_sources)
        cp_where.append(f"source_id IN {in_clause}")
        cp_params.extend(in_params)
    participant_rows = _rows_to_dicts(
        db.execute(
            f"""
            SELECT conversation_id, source_id, contact_id
            FROM conversation_participants
            WHERE {" AND ".join(cp_where)}
            """,
            tuple(cp_params),
        ).fetchall()
    )

    participants_by_conversation: Dict[Tuple[str, str], Set[str]] = defaultdict(set)
    for row in participant_rows:
        conv_key = (str(row.get("conversation_id") or ""), str(row.get("source_id") or ""))
        contact_id = str(row.get("contact_id") or "")
        if contact_id:
            participants_by_conversation[conv_key].add(contact_id)

    contact_profiles = _load_contact_profiles(db, dataset_id=dataset_id)
    contact_identifiers = _load_contact_identifiers(
        db,
        dataset_id=dataset_id,
        source_ids=normalized_sources,
    )

    # --- Build one graph per period ----------------------------------------
    ordered_periods = sorted(messages_by_period)
    period_payloads: List[Dict[str, Any]] = []
    cumulative_messages: List[Tuple[Tuple[str, str], Dict[str, Any]]] = []
    cumulative_conv_keys: Set[Tuple[str, str]] = set()

    for period_key in ordered_periods:
        current_messages = messages_by_period[period_key]
        current_conv_keys = conversation_keys_by_period[period_key]
        if cumulative:
            cumulative_messages.extend(current_messages)
            cumulative_conv_keys |= current_conv_keys
            period_messages = cumulative_messages
            period_conv_keys = cumulative_conv_keys
        else:
            period_messages = current_messages
            period_conv_keys = current_conv_keys

        # Copy the stored membership so the senders added below stay local
        # to this period.
        period_participants_by_conv: Dict[Tuple[str, str], Set[str]] = {
            conv_key: set(participants_by_conversation.get(conv_key, set()))
            for conv_key in period_conv_keys
        }

        participant_ids: Set[str] = set()
        node_sources: Dict[str, Set[str]] = defaultdict(set)

        for conv_key, contact_ids in period_participants_by_conv.items():
            src = conv_key[1]
            for contact_id in contact_ids:
                participant_ids.add(contact_id)
                if src:
                    node_sources[contact_id].add(src)

        # One pass over the messages: register resolved senders as
        # participants and group attributable messages by conversation, so
        # the edge pass below does not re-scan the period per conversation.
        messages_by_conv: Dict[Tuple[str, str], List[Dict[str, Any]]] = defaultdict(list)
        for conv_key, msg in period_messages:
            contact_id = msg["sender_contact_id"]
            if not contact_id:
                # Unresolved senders create neither nodes nor direct edges.
                continue
            participant_ids.add(contact_id)
            if conv_key[1]:
                node_sources[contact_id].add(conv_key[1])
            period_participants_by_conv.setdefault(conv_key, set()).add(contact_id)
            messages_by_conv[conv_key].append(msg)

        aliases_by_contact = _participant_aliases_for_contacts(
            contact_ids=participant_ids,
            contact_profiles=contact_profiles,
            contact_identifiers=contact_identifiers,
        )

        # Edge keys are always (smaller_id, larger_id) so the three counters
        # agree on the key for an undirected pair.
        co_edges: Counter[Tuple[str, str]] = Counter()
        reply_edges: Counter[Tuple[str, str]] = Counter()
        mention_edges: Counter[Tuple[str, str]] = Counter()

        for conv_key, members in period_participants_by_conv.items():
            for src_id, tgt_id in combinations(sorted(members), 2):
                if src_id and tgt_id:
                    co_edges[(src_id, tgt_id)] += 1

            conv_messages = messages_by_conv.get(conv_key)
            if not conv_messages:
                continue

            # Mentions resolve only against members of this conversation and
            # only through aliases that identify exactly one of them.
            conv_alias_lookup = _build_unique_alias_lookup(
                {cid: aliases_by_contact.get(cid, set()) for cid in members}
            )
            for msg in conv_messages:
                sender_id = msg["sender_contact_id"]

                reply_to_message_id = str(msg.get("reply_to_message_id") or "").strip()
                if reply_to_message_id:
                    target_id = global_message_sender.get(reply_to_message_id)
                    if target_id and target_id in members and target_id != sender_id:
                        reply_edges[tuple(sorted((sender_id, target_id)))] += 1

                for mention in _extract_mentions(msg.get("content")):
                    target_id = conv_alias_lookup.get(mention)
                    if target_id and target_id != sender_id:
                        mention_edges[tuple(sorted((sender_id, target_id)))] += 1

        all_edges = set(co_edges) | set(reply_edges) | set(mention_edges)
        edges_payload: List[Dict[str, Any]] = []
        for src_id, tgt_id in sorted(all_edges):
            pair = (src_id, tgt_id)
            edge_type_counts: Dict[str, int] = {}
            if co_edges.get(pair, 0):
                edge_type_counts["co_participation"] = int(co_edges[pair])
            if reply_edges.get(pair, 0):
                edge_type_counts["direct_reply"] = int(reply_edges[pair])
            if mention_edges.get(pair, 0):
                edge_type_counts["direct_mention"] = int(mention_edges[pair])
            total_weight = sum(edge_type_counts.values())
            if len(edge_type_counts) == 1:
                edge_type = next(iter(edge_type_counts))
            else:
                edge_type = "mixed"
            edges_payload.append(
                {
                    "source": src_id,
                    "target": tgt_id,
                    "weight": total_weight,
                    "edge_type": edge_type,
                    "edge_type_counts": edge_type_counts,
                }
            )

        nodes_payload: List[Dict[str, Any]] = []
        for contact_id in sorted(participant_ids):
            profile = contact_profiles.get(contact_id) or {}
            # Fall back to the raw contact id when no display name is known.
            label = str(profile.get("display_name") or "").strip() or contact_id
            nodes_payload.append(
                {
                    "id": contact_id,
                    "label": label,
                    "source_ids": sorted(node_sources.get(contact_id, set())),
                }
            )

        period_payloads.append(
            {
                "period_key": period_key,
                "nodes": nodes_payload,
                "edges": edges_payload,
            }
        )

    return {
        "period_granularity": period_granularity,
        "source_ids": normalized_sources or [],
        "periods": period_payloads,
        "investigation": SOURCE_DIRECT_LINK_INVESTIGATION,
    }