topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,321 @@
1
+ """Helpers for resolving participant labels in messenger analytics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections import defaultdict
6
+ from typing import Any, Dict, List, Sequence, Set, Tuple
7
+
8
+
9
+ def _rows_to_dicts(rows: Sequence[Any], cursor: Any = None) -> List[Dict[str, Any]]:
10
+ """Map DB rows to dicts. Plain sqlite3 connections return tuples; use ``cursor.description``."""
11
+ out: List[Dict[str, Any]] = []
12
+ col_names: List[str] | None = None
13
+ if cursor is not None and getattr(cursor, "description", None):
14
+ col_names = [d[0] for d in cursor.description if d is not None]
15
+ for row in rows:
16
+ if hasattr(row, "keys"):
17
+ out.append({k: row[k] for k in row.keys()})
18
+ elif col_names is not None and isinstance(row, (tuple, list)) and len(row) == len(col_names):
19
+ out.append({col_names[i]: row[i] for i in range(len(col_names))})
20
+ else:
21
+ out.append(dict(row))
22
+ return out
23
+
24
+
25
+ def _in_clause(values: Sequence[str]) -> tuple[str, List[str]]:
26
+ placeholders = ",".join(["?"] * len(values))
27
+ return f"({placeholders})", list(values)
28
+
29
+
30
+ def _normalize_contact_key(value: Any) -> str:
31
+ s = str(value or "").strip()
32
+ if not s:
33
+ return ""
34
+ low = s.lower()
35
+ if low == "self":
36
+ return "self"
37
+ if "@" in low:
38
+ return low
39
+ digits = "".join(ch for ch in s if ch.isdigit())
40
+ if digits:
41
+ return f"+{digits}" if s.startswith("+") else digits
42
+ return low
43
+
44
+
45
+ def sender_matches_focus_identifier(sender_id: str, profile_identifier: str) -> bool:
46
+ """True if message ``sender_id`` refers to the same party as the profile row's primary identifier."""
47
+ a = str(sender_id or "").strip()
48
+ b = str(profile_identifier or "").strip()
49
+ if not a or not b:
50
+ return False
51
+ if _identifier_candidates(a) & _identifier_candidates(b):
52
+ return True
53
+ na, nb = _normalize_contact_key(a), _normalize_contact_key(b)
54
+ return bool(na and nb and na == nb)
55
+
56
+
57
+ def _identifier_candidates(value: str) -> Set[str]:
58
+ raw = str(value or "").strip()
59
+ if not raw:
60
+ return set()
61
+ out = {raw, raw.lower()}
62
+ normalized = _normalize_contact_key(raw)
63
+ if normalized:
64
+ out.add(normalized)
65
+ digits = "".join(ch for ch in raw if ch.isdigit())
66
+ if digits:
67
+ out.add(digits)
68
+ out.add(f"+{digits}")
69
+ # Common NANP variant: some imports drop leading country code 1.
70
+ if len(digits) == 11 and digits.startswith("1"):
71
+ local10 = digits[1:]
72
+ out.add(local10)
73
+ out.add(f"+{local10}")
74
+ return {v for v in out if v}
75
+
76
+
77
def resolve_participant_labels(
    conn: Any,
    *,
    dataset_id: str,
    participant_ids: Sequence[str],
) -> Dict[str, Dict[str, str]]:
    """Resolve display labels for participant contact IDs.

    Priority:
    1) contacts.display_name
    2) a contact identifier from contact_identifiers
    3) raw participant_id

    Args:
        conn: DB connection whose ``execute`` accepts ``?`` placeholders and
            returns a cursor with ``fetchall()``.
        dataset_id: dataset whose ``contacts`` / ``contact_identifiers`` rows are searched.
        participant_ids: contact ids or raw identifiers; blanks and duplicates are dropped.

    Returns:
        ``{participant_id: {"label", "display_name", "identifier"}}`` for every
        distinct non-blank participant.  ``display_name`` is ``""`` when none
        could be resolved; ``label`` is never empty.

    Candidate spellings are always walked in sorted order so that an ambiguous
    participant resolves to the same contact on every run.
    """
    normalized_participants = sorted({str(pid).strip() for pid in participant_ids if str(pid).strip()})
    if not normalized_participants:
        return {}

    def _query(sql: str, params: Sequence[str]) -> List[Dict[str, Any]]:
        # Every statement is scoped to the dataset via its first placeholder.
        cursor = conn.execute(sql, tuple([dataset_id] + list(params)))
        return _rows_to_dicts(cursor.fetchall(), cursor)

    def _rows_for_identifiers(identifiers: Sequence[str]) -> List[Dict[str, Any]]:
        # Identifier rows (plus owning contact's name) for any of the given spellings.
        in_clause, params = _in_clause(identifiers)
        return _query(
            f"""
            SELECT ci.contact_id, ci.identifier, ci.source_id, c.display_name
            FROM contact_identifiers ci
            LEFT JOIN contacts c
              ON c.dataset_id = ci.dataset_id
             AND c.contact_id = ci.contact_id
            WHERE ci.dataset_id = ?
              AND ci.identifier IN {in_clause}
            ORDER BY CASE WHEN ci.source_id = '*' THEN 1 ELSE 0 END, ci.updated_at DESC
            """,
            params,
        )

    contacts_in_clause, contacts_params = _in_clause(normalized_participants)
    participant_candidates: Dict[str, Set[str]] = {
        participant_id: _identifier_candidates(participant_id)
        for participant_id in normalized_participants
    }
    all_identifier_candidates = sorted({cand for cands in participant_candidates.values() for cand in cands})

    contacts_rows = _query(
        f"""
        SELECT contact_id, display_name
        FROM contacts
        WHERE dataset_id = ? AND contact_id IN {contacts_in_clause}
        """,
        contacts_params,
    )
    display_name_by_contact_id = {
        str(row["contact_id"]): str(row["display_name"]).strip()
        for row in contacts_rows
        if row.get("contact_id") and row.get("display_name") and str(row["display_name"]).strip()
    }

    # Keep identifier fallback for participants that are already contact_ids.
    contact_identifier_rows = _query(
        f"""
        SELECT contact_id, identifier, source_id
        FROM contact_identifiers
        WHERE dataset_id = ?
          AND contact_id IN {contacts_in_clause}
        ORDER BY CASE WHEN source_id = '*' THEN 1 ELSE 0 END, updated_at DESC
        """,
        contacts_params,
    )

    identifier_rows: List[Dict[str, Any]] = []
    if all_identifier_candidates:
        identifier_rows = _rows_for_identifiers(all_identifier_candidates)

    best_identifier_by_contact_id: Dict[str, str] = {}
    display_name_by_identifier: Dict[str, str] = {}
    contact_ids_by_identifier: Dict[str, List[str]] = defaultdict(list)

    def _index_identifier_rows(rows: Sequence[Dict[str, Any]]) -> None:
        # First row seen for a contact / identifier wins, so query ORDER BY decides priority.
        for row in rows:
            contact_id = str(row.get("contact_id") or "").strip()
            identifier = str(row.get("identifier") or "").strip()
            display_name = str(row.get("display_name") or "").strip()
            if not contact_id or not identifier:
                continue
            if display_name and contact_id not in display_name_by_contact_id:
                display_name_by_contact_id[contact_id] = display_name
            if contact_id not in best_identifier_by_contact_id:
                best_identifier_by_contact_id[contact_id] = identifier
            for candidate in _identifier_candidates(identifier):
                if contact_id not in contact_ids_by_identifier[candidate]:
                    contact_ids_by_identifier[candidate].append(contact_id)
                if display_name and candidate not in display_name_by_identifier:
                    display_name_by_identifier[candidate] = display_name

    _index_identifier_rows(identifier_rows)

    # Also index identifiers that belong to participant contact_ids directly (used for fallback labeling).
    _index_identifier_rows(contact_identifier_rows)

    secondary_identifier_candidates = sorted(
        {
            candidate
            for identifier in best_identifier_by_contact_id.values()
            for candidate in _identifier_candidates(identifier)
        }
    )
    if secondary_identifier_candidates:
        _index_identifier_rows(_rows_for_identifiers(secondary_identifier_candidates))

    out: Dict[str, Dict[str, str]] = {}
    for participant_id in normalized_participants:
        display_name = display_name_by_contact_id.get(participant_id, "")
        identifier = best_identifier_by_contact_id.get(participant_id, "")
        own_candidates = sorted(participant_candidates.get(participant_id, set()))

        if not display_name:
            # The participant may be a raw identifier: find the contact that owns one of its spellings.
            matched_contact_id = ""
            for candidate in own_candidates:
                contact_ids = contact_ids_by_identifier.get(candidate, [])
                if contact_ids:
                    matched_contact_id = contact_ids[0]
                    break
            if matched_contact_id:
                display_name = display_name_by_contact_id.get(matched_contact_id, "") or display_name
                identifier = best_identifier_by_contact_id.get(matched_contact_id, "") or identifier
            if not display_name:
                # Fallback to identifier-level display mapping (e.g., when contact row has sparse data).
                for candidate in own_candidates:
                    maybe_name = display_name_by_identifier.get(candidate, "")
                    if maybe_name:
                        display_name = maybe_name
                        break

        if not identifier:
            identifier = participant_id

        # If this participant maps to an unnamed contact_id but we do have an identifier,
        # try resolving that identifier to another contact with a display name
        # (common after contact import where normalized phone variants point to different contact_ids).
        if not display_name and identifier:
            identifier_matched_contact_id = ""
            fallback_contact_id = ""
            for candidate in sorted(_identifier_candidates(identifier)):
                contact_ids = contact_ids_by_identifier.get(candidate, [])
                if not contact_ids:
                    continue
                named_ids = [cid for cid in contact_ids if display_name_by_contact_id.get(cid)]
                if named_ids:
                    identifier_matched_contact_id = named_ids[0]
                    break
                if not fallback_contact_id:
                    fallback_contact_id = contact_ids[0]
            if not identifier_matched_contact_id and fallback_contact_id:
                identifier_matched_contact_id = fallback_contact_id
            if identifier_matched_contact_id:
                display_name = display_name_by_contact_id.get(identifier_matched_contact_id, "") or display_name
                identifier = best_identifier_by_contact_id.get(identifier_matched_contact_id, "") or identifier

        label = display_name or identifier or participant_id
        out[participant_id] = {
            "label": label,
            "display_name": display_name,
            "identifier": identifier,
        }
    return out
261
+
262
+
263
def enrich_conversation_thread_previews(
    conn: Any,
    *,
    dataset_id: str,
    profile_identifier: str,
    previews: List[Dict[str, Any]],
) -> None:
    """Mutates each message in ``previews``: adds ``sender_display_name`` and ``is_focus_contact``.

    Only dict entries under each block's ``"messages"`` key are touched.
    ``sender_display_name`` is the resolved label, else the sender id, else
    ``"Unknown"``.  ``is_focus_contact`` is true when the sender id is non-blank
    and matches ``profile_identifier``.
    """

    def _sender_of(message: Dict[str, Any]) -> str:
        return str(message.get("sender_id") or "").strip()

    dict_messages = [
        message
        for block in previews
        for message in (block.get("messages") or [])
        if isinstance(message, dict)
    ]
    known_senders = [sender for sender in map(_sender_of, dict_messages) if sender]
    labels = resolve_participant_labels(conn, dataset_id=dataset_id, participant_ids=known_senders)

    for message in dict_messages:
        sender = _sender_of(message)
        if sender:
            resolved = labels.get(sender, {})
        else:
            resolved = {}
        display = str(resolved.get("label") or "").strip()
        message["sender_display_name"] = display or sender or "Unknown"
        message["is_focus_contact"] = bool(sender) and sender_matches_focus_identifier(sender, profile_identifier)
289
+
290
+
291
def enrich_contact_rows_with_resolved_display_names(
    conn: Any,
    *,
    dataset_id: str,
    contacts: List[Dict[str, Any]],
) -> None:
    """Fill empty ``display_name`` on owner/API contact rows (parity with messenger social graph).

    ``list_contacts`` returns ``contacts.display_name`` per row only. Analytics uses
    :func:`resolve_participant_labels` to promote names across identifier variants and
    duplicate contact_ids (e.g. iMessage sender vs address-book import). Apply the same
    resolution here so grant privacy UI and filters see the same labels as the graph.

    Rows are mutated in place; rows that already have a non-blank ``display_name``
    are left alone.  For each remaining row the name resolved for its
    ``contact_id`` is used, falling back to the name resolved for its
    ``identifier``.  No query is issued when no row needs a name.
    """
    lookup_keys = ("contact_id", "identifier")

    def _clean(row: Dict[str, Any], key: str) -> str:
        return str(row.get(key) or "").strip()

    unnamed = [row for row in contacts if not _clean(row, "display_name")]
    if not unnamed:
        return

    # Every row's ids are sent along (not just the unnamed ones), as before.
    participant_ids = [
        value
        for row in contacts
        for value in (_clean(row, key) for key in lookup_keys)
        if value
    ]
    if not participant_ids:
        return

    labels = resolve_participant_labels(conn, dataset_id=dataset_id, participant_ids=participant_ids)
    for row in unnamed:
        for key in lookup_keys:
            value = _clean(row, key)
            if not value:
                continue
            resolved = str((labels.get(value) or {}).get("display_name") or "").strip()
            if resolved:
                row["display_name"] = resolved
                break
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
# Query names run for the shared ChatGPT analytics profile.
CHATGPT_DEV_PROFILE = dict(
    profile_id="chatgpt_dev",
    queries=[
        "messages_per_day",
        "total_messages",
        "messages_by_sender",
        "avg_message_length",
    ],
)

# Profile lookup table. The per-source profile ids after the first entry
# are aliases that map to the shared ChatGPT profile object.
PROFILE_REGISTRY = dict.fromkeys(
    (
        CHATGPT_DEV_PROFILE["profile_id"],
        "chatgpt_file_ingestion",
        "chatgpt_ui_conversation",
    ),
    CHATGPT_DEV_PROFILE,
)
19
+
20
+
21
def get_profile(profile_id: str) -> dict | None:
    """Return the registered profile for ``profile_id``, or ``None`` when it is unknown."""
    try:
        return PROFILE_REGISTRY[profile_id]
    except KeyError:
        return None
@@ -0,0 +1,64 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from .duckdb_adapter import DuckDBAdapter
6
+
7
+
8
class QueryEngine:
    """Canned analytics queries over ``projection.messages``.

    Every query accepts an optional ``dataset_id``; when it is truthy the
    query is restricted to that dataset through a bound ``?`` parameter.
    The adapter's ``execute(query, params)`` is expected to return a list of
    dict-like rows.
    """

    def __init__(self, adapter: DuckDBAdapter):
        self.adapter = adapter

    @staticmethod
    def _scoped(base_query: str, dataset_id: Optional[str], suffix: str = "") -> tuple[str, List[Any]]:
        """Append the optional dataset filter and a trailing clause to ``base_query``."""
        params: List[Any] = []
        query = base_query
        if dataset_id:
            query += " WHERE dataset_id = ?"
            params.append(dataset_id)
        return query + suffix, params

    def query_messages_per_day(self, dataset_id: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return ``day`` / ``message_count`` rows, newest day first."""
        query, params = self._scoped(
            """
            SELECT DATE(ts) as day, COUNT(*) as message_count
            FROM projection.messages
            """,
            dataset_id,
            " GROUP BY day ORDER BY day DESC",
        )
        return self.adapter.execute(query, params)

    def query_total_messages(self, dataset_id: Optional[str] = None) -> Dict[str, Any]:
        """Return the first result row, or ``{"total_messages": 0}`` when there are no rows."""
        query, params = self._scoped(
            "SELECT COUNT(*) as total_messages FROM projection.messages",
            dataset_id,
        )
        rows = self.adapter.execute(query, params)
        return rows[0] if rows else {"total_messages": 0}

    def query_messages_by_sender(self, dataset_id: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return ``sender_type`` / ``count`` rows, most frequent sender type first."""
        query, params = self._scoped(
            """
            SELECT sender_type, COUNT(*) as count
            FROM projection.messages
            """,
            dataset_id,
            " GROUP BY sender_type ORDER BY count DESC",
        )
        return self.adapter.execute(query, params)

    def query_avg_message_length(self, dataset_id: Optional[str] = None) -> Dict[str, Any]:
        """Return average / minimum / maximum ``content`` length.

        Missing or NULL aggregates (e.g. an empty table) are reported as zero.
        """
        query, params = self._scoped(
            """
            SELECT AVG(LENGTH(content)) as avg_length,
                   MIN(LENGTH(content)) as min_length,
                   MAX(LENGTH(content)) as max_length
            FROM projection.messages
            """,
            dataset_id,
        )
        rows = self.adapter.execute(query, params)
        if not rows:
            return {"avg_length": 0.0, "min_length": 0, "max_length": 0}
        row = rows[0]
        return {
            "avg_length": float(row.get("avg_length") or 0),
            "min_length": int(row.get("min_length") or 0),
            "max_length": int(row.get("max_length") or 0),
        }
@@ -0,0 +1,174 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ from collections import Counter, defaultdict
6
+ from datetime import datetime, timedelta, timezone
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ from ..storage.raw.file_store import RawFileStore
11
+
12
+ logger = logging.getLogger("topos.analytics.raw_queries")
13
+
14
+
15
+ def _normalize_ts(value: Any) -> str:
16
+ if isinstance(value, (int, float)):
17
+ return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
18
+ if isinstance(value, str):
19
+ return value
20
+ return ""
21
+
22
+
23
+ def _normalize_sender(payload: dict) -> str:
24
+ role = (payload.get("role") or "").lower()
25
+ if role:
26
+ return "human" if role == "user" else role
27
+ sender_type = payload.get("sender_type")
28
+ return sender_type or "assistant"
29
+
30
+
31
def _message_from_payload(payload: dict, fallback_id: str, dataset_id: str) -> dict:
    """Build the flat message record used by the raw analytics helpers.

    ``message_id`` comes from ``id``, then ``message_id``, then ``fallback_id``.
    ``ts`` comes from ``created_at``, then ``ts``.  ``source_id`` is only
    included (as a string) when the payload carries a non-``None`` value.
    """
    message_id = payload.get("id") or payload.get("message_id") or fallback_id
    timestamp = payload.get("created_at") or payload.get("ts")
    message: Dict[str, Any] = dict(
        message_id=message_id,
        dataset_id=dataset_id,
        sender_type=_normalize_sender(payload),
        ts=_normalize_ts(timestamp),
        content=payload.get("content", ""),
    )
    source_id = payload.get("source_id")
    if source_id is not None:
        message["source_id"] = str(source_id)
    return message
43
+
44
+
45
+ def _parse_ts_to_datetime(ts: str) -> Optional[datetime]:
46
+ """Parse ISO-like ts string to datetime for comparison. Returns None if unparseable."""
47
+ if not ts:
48
+ return None
49
+ try:
50
+ if isinstance(ts, (int, float)):
51
+ return datetime.fromtimestamp(ts, tz=timezone.utc)
52
+ s = str(ts).strip()
53
+ if "T" in s:
54
+ dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
55
+ return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)
56
+ return datetime.strptime(s[:10], "%Y-%m-%d").replace(tzinfo=timezone.utc)
57
+ except (ValueError, TypeError):
58
+ return None
59
+
60
+
61
+ def _apply_filter_manifest_to_messages(
62
+ messages: List[Dict[str, Any]],
63
+ manifest: Optional[Dict[str, Any]],
64
+ ) -> List[Dict[str, Any]]:
65
+ """Apply filter_manifest (rolling_window_days, date_range, source_filter) in Python. Stage 2b."""
66
+ if not manifest or not isinstance(manifest, dict):
67
+ return messages
68
+ out: List[Dict[str, Any]] = []
69
+ now = datetime.now(timezone.utc)
70
+ rolling_days: Optional[int] = None
71
+ if manifest.get("rolling_window_days") is not None:
72
+ try:
73
+ rolling_days = max(0, int(manifest["rolling_window_days"]))
74
+ except (TypeError, ValueError):
75
+ pass
76
+ range_start: Optional[datetime] = None
77
+ if manifest.get("date_range_start"):
78
+ range_start = _parse_ts_to_datetime(str(manifest["date_range_start"]))
79
+ range_end: Optional[datetime] = None
80
+ if manifest.get("date_range_end"):
81
+ range_end = _parse_ts_to_datetime(str(manifest["date_range_end"]))
82
+ source_allow: Optional[List[str]] = None
83
+ if isinstance(manifest.get("source_filter"), list) and len(manifest["source_filter"]) > 0:
84
+ source_allow = [str(s) for s in manifest["source_filter"]]
85
+ for msg in messages:
86
+ ts_str = msg.get("ts")
87
+ dt = _parse_ts_to_datetime(ts_str) if ts_str else None
88
+ if rolling_days is not None and dt is not None:
89
+ if dt < now - timedelta(days=rolling_days):
90
+ continue
91
+ if range_start is not None and dt is not None and dt < range_start:
92
+ continue
93
+ if range_end is not None and dt is not None and dt > range_end:
94
+ continue
95
+ if source_allow is not None:
96
+ sid = msg.get("source_id")
97
+ if sid is not None and str(sid) not in source_allow:
98
+ continue
99
+ out.append(msg)
100
+ return out
101
+
102
+
103
def load_raw_messages(
    *,
    dataset_id: str,
    schema_id: str,
    limit: Optional[int] = None,
    offset: int = 0,
    filter_manifest: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Load messages from the raw JSON-lines file of a dataset/schema pair.

    The file is located through ``RawFileStore.get_file_path``; a missing file
    yields ``[]``.  Blank lines, lines that are not valid JSON and lines whose
    JSON value is not an object are skipped.  ``filter_manifest`` is applied
    first, then ``offset``, then ``limit``.  The 1-based line number serves as
    fallback message id.
    """
    file_store = RawFileStore()
    file_path = file_store.get_file_path(dataset_id, schema_id)
    logger.debug(
        "[PIPELINE:ANALYTICS] Loading raw messages: dataset_id=%s, schema_id=%s, file_path=%s, limit=%s, offset=%s",
        dataset_id,
        schema_id,
        file_path,
        limit,
        offset,
    )
    if not file_path.exists():
        logger.debug("[PIPELINE:ANALYTICS] Raw file does not exist: %s", file_path)
        return []
    messages: List[Dict[str, Any]] = []
    skipped = 0
    with Path(file_path).open("r", encoding="utf-8") as handle:
        for line_no, line in enumerate(handle, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                payload = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            if not isinstance(payload, dict):
                # Valid JSON but not a message object (list, number, ...): nothing to read from it.
                skipped += 1
                continue
            messages.append(_message_from_payload(payload, str(line_no), dataset_id))
    if skipped:
        logger.debug("[PIPELINE:ANALYTICS] Skipped %d unreadable raw lines in %s", skipped, file_path)
    messages = _apply_filter_manifest_to_messages(messages, filter_manifest)
    if offset:
        messages = messages[offset:]
    if limit is not None:
        messages = messages[:limit]
    logger.debug(
        "[PIPELINE:ANALYTICS] Loaded %d messages (after limit/offset)",
        len(messages),
    )
    return messages
145
+
146
+
147
def messages_per_day(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Count messages per day, ordered by day ascending.

    The day is the part of ``ts`` before the first ``"T"``; messages with an
    empty or missing ``ts`` are ignored.
    """
    stamps = (message.get("ts") or "" for message in messages)
    day_counts = Counter(stamp.split("T", 1)[0] for stamp in stamps if stamp)
    return [{"day": day, "count": day_counts[day]} for day in sorted(day_counts)]
155
+
156
+
157
def total_messages(messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Return the number of messages under the ``total_messages`` key."""
    count = len(messages)
    return dict(total_messages=count)
159
+
160
+
161
def messages_by_sender(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Count messages per ``sender_type`` (``"unknown"`` when absent), most frequent first.

    Sender types with equal counts keep their first-seen order.
    """
    tally: Counter = Counter()
    for msg in messages:
        tally[msg.get("sender_type") or "unknown"] += 1
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return [{"sender_type": sender, "count": count} for sender, count in ranked]
164
+
165
+
166
def avg_message_length(messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Return average, minimum and maximum ``content`` length.

    Missing or empty content counts as length 0; an empty message list yields
    all zeros.
    """
    sizes = []
    for msg in messages:
        body = msg.get("content") or ""
        sizes.append(len(body))
    if not sizes:
        return {"avg_length": 0.0, "min_length": 0, "max_length": 0}
    total = sum(sizes)
    return dict(
        avg_length=float(total) / len(sizes),
        min_length=min(sizes),
        max_length=max(sizes),
    )
topos/api/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """API routers for Topos."""
topos/api/analytics.py ADDED
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional
4
+
5
+ from fastapi import APIRouter, Query
6
+
7
+ from ..analytics.duckdb_adapter import DuckDBAdapter
8
+ from ..analytics.profiles import get_profile
9
+ from ..analytics.query_engine import QueryEngine
10
+
11
+ router = APIRouter()
12
+
13
+
14
# GET /analytics: run one named analytics query, or every query in a profile.
#
# Parameters (all optional query-string values):
#   query      - name of a single query to run.
#   profile_id - when set, takes precedence over `query`; every query listed in
#                the profile's "queries" entry is run.
#   dataset_id - forwarded unchanged to each engine query.
#
# Responses:
#   profile path : {"profile_id": ..., "results": {name: result, ...}}; an
#                  unsupported name maps to {"error": "unsupported query"} and
#                  a query that raises maps to [] (the failure is logged).
#   unknown profile: {"status": "error", "error": "unknown profile_id"}.
#   single query : {"query": ..., "result": ...}; exceptions propagate.
#   otherwise    : {"status": "stub", "query": ..., "dataset_id": ...}.
#
# This is a comment rather than a docstring because FastAPI publishes a
# handler docstring as the OpenAPI description.
@router.get("/analytics")
async def get_analytics_endpoint(
    query: Optional[str] = Query(None, description="Analytics query name"),
    profile_id: Optional[str] = Query(None, description="Analytics profile id"),
    dataset_id: Optional[str] = Query(None),
) -> dict:
    # Imported locally so this block does not depend on a module-level import.
    import logging

    adapter = DuckDBAdapter()
    engine = QueryEngine(adapter)

    # One name -> runner table shared by the profile and single-query paths.
    # Lambdas keep the engine attribute lookup lazy, so a lookup failure is
    # raised at call time (inside the profile path's try) rather than here.
    runners = {
        "messages_per_day": lambda: engine.query_messages_per_day(dataset_id=dataset_id),
        "total_messages": lambda: engine.query_total_messages(dataset_id=dataset_id),
        "messages_by_sender": lambda: engine.query_messages_by_sender(dataset_id=dataset_id),
        "avg_message_length": lambda: engine.query_avg_message_length(dataset_id=dataset_id),
    }

    if profile_id:
        profile = get_profile(profile_id)
        if not profile:
            return {"status": "error", "error": "unknown profile_id"}
        results = {}
        for item in profile["queries"]:
            runner = runners.get(item)
            if runner is None:
                results[item] = {"error": "unsupported query"}
                continue
            try:
                results[item] = runner()
            except Exception:
                # Best effort: one failing query must not sink the whole
                # profile, but the failure is recorded instead of vanishing.
                logging.getLogger(__name__).exception(
                    "Analytics query %r failed (profile_id=%r, dataset_id=%r)",
                    item,
                    profile_id,
                    dataset_id,
                )
                results[item] = []
        return {"profile_id": profile_id, "results": results}

    runner = runners.get(query)
    if runner is not None:
        return {"query": query, "result": runner()}
    return {"status": "stub", "query": query, "dataset_id": dataset_id}
topos/api/app_registry.py ADDED
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict
4
+
5
+ from fastapi import APIRouter, Body, Depends
6
+
7
+ from ..auth import require_api_key
8
+
9
+ router = APIRouter()
10
+
11
+
12
# GET /apps (API key required): stub that always reports an empty app list.
# Documented with a comment, not a docstring, so the generated OpenAPI
# description is unchanged.
@router.get("/apps", dependencies=[Depends(require_api_key)])
async def list_apps() -> Dict[str, Any]:
    return dict(status="stub", apps=[])
15
+
16
+
17
# POST /apps (API key required): stub that echoes the request body back under
# the "app" key; the body defaults to an empty dict.
# Documented with a comment, not a docstring, so the generated OpenAPI
# description is unchanged.
@router.post("/apps", dependencies=[Depends(require_api_key)])
async def create_app(payload: Dict[str, Any] = Body(default_factory=dict)) -> Dict[str, Any]:
    return dict(status="stub", app=payload)
20
+
21
+
22
# GET /apps/{app_id}/sources (API key required): stub that returns the given
# app_id with an empty source list.
# Documented with a comment, not a docstring, so the generated OpenAPI
# description is unchanged.
@router.get("/apps/{app_id}/sources", dependencies=[Depends(require_api_key)])
async def list_app_sources(app_id: str) -> Dict[str, Any]:
    return dict(status="stub", app_id=app_id, sources=[])
25
+
26
+
27
# POST /apps/{app_id}/sources (API key required): stub that echoes the request
# body back under the "source" key together with the given app_id; the body
# defaults to an empty dict.
# Documented with a comment, not a docstring, so the generated OpenAPI
# description is unchanged.
@router.post("/apps/{app_id}/sources", dependencies=[Depends(require_api_key)])
async def create_app_source(
    app_id: str,
    payload: Dict[str, Any] = Body(default_factory=dict),
) -> Dict[str, Any]:
    return dict(status="stub", app_id=app_id, source=payload)
topos/api/backup.py ADDED
@@ -0,0 +1,15 @@
1
+ from __future__ import annotations
2
+
3
+ from fastapi import APIRouter
4
+
5
+ router = APIRouter()
6
+
7
+
8
# POST /backup: stub; performs no work and returns a fixed status payload.
# Documented with a comment, not a docstring, so the generated OpenAPI
# description is unchanged.
@router.post("/backup")
async def backup_database() -> dict:
    stub_response = {"status": "stub"}
    return stub_response
11
+
12
+
13
# POST /restore: stub; performs no work and returns a fixed status payload.
# Documented with a comment, not a docstring, so the generated OpenAPI
# description is unchanged.
@router.post("/restore")
async def restore_database() -> dict:
    stub_response = {"status": "stub"}
    return stub_response