topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,229 @@
1
+ """
2
+ Stage 9 schema registry: machine-readable contract for table/column canonical names,
3
+ types, and categories (informational vs organizational).
4
+
5
+ Source of truth: docs/SCHEMA_CONVENTIONS.md §7.
6
+ Used by: engine, control plane, and UI (via assist APIs).
7
+ Organizational columns are non-filterable by default.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ # -----------------------------------------------------------------------------
15
+ # Column entry: one row per (table, column) with current DB name, canonical name, type, category.
16
+ # For Stage 9 rename targets, current_column_name != canonical_column_name (current = name in DB today).
17
+ # -----------------------------------------------------------------------------
18
+
19
+ CATEGORY_INFORMATIONAL = "informational"
20
+ CATEGORY_ORGANIZATIONAL = "organizational"
21
+
22
+
23
+ def _row(
24
+ table: str,
25
+ current: str,
26
+ canonical: str,
27
+ type_name: str,
28
+ category: str,
29
+ ) -> Dict[str, Any]:
30
+ """Build a registry row; rename_target True when current != canonical."""
31
+ return {
32
+ "table_name": table,
33
+ "current_column_name": current,
34
+ "canonical_column_name": canonical,
35
+ "type": type_name,
36
+ "category": category,
37
+ "filterable_by_default": category == CATEGORY_INFORMATIONAL,
38
+ "rename_target": current != canonical,
39
+ }
40
+
41
+
42
+ # Registry: all Stage 9 mapped tables from SCHEMA_CONVENTIONS.md §7.1–§7.7, plus the Stage 11 contacts tables.
43
+ # §7.0 rename targets: current = name in DB today; canonical = name after migration.
44
+ SCHEMA_REGISTRY: List[Dict[str, Any]] = [
45
+ # ----- conversation_messages (7.1) -----
46
+ _row("conversation_messages", "message_id", "message_id", "identifier", CATEGORY_ORGANIZATIONAL),
47
+ _row("conversation_messages", "conversation_id", "conversation_id", "identifier", CATEGORY_ORGANIZATIONAL),
48
+ _row("conversation_messages", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
49
+ _row("conversation_messages", "sender_type", "sender_type", "text", CATEGORY_INFORMATIONAL),
50
+ _row("conversation_messages", "sender_id", "sender_id", "text", CATEGORY_INFORMATIONAL),
51
+ _row("conversation_messages", "reply_to_message_id", "reply_to_message_id", "identifier", CATEGORY_ORGANIZATIONAL),
52
+ _row("conversation_messages", "message_type", "message_type", "text", CATEGORY_INFORMATIONAL),
53
+ _row("conversation_messages", "event_type", "event_type", "text", CATEGORY_INFORMATIONAL),
54
+ _row("conversation_messages", "content", "content", "text", CATEGORY_INFORMATIONAL),
55
+ _row("conversation_messages", "ts", "event_at", "timestamp_utc", CATEGORY_INFORMATIONAL), # rename
56
+ _row("conversation_messages", "source_id", "source_id", "text", CATEGORY_ORGANIZATIONAL),
57
+ _row("conversation_messages", "metadata_json", "metadata_json", "json", CATEGORY_INFORMATIONAL),
58
+ _row("conversation_messages", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
59
+ _row("conversation_messages", "from_self", "is_from_self", "integer", CATEGORY_ORGANIZATIONAL), # rename
60
+ _row("conversation_messages", "owner_user_id", "owner_user_id", "identifier", CATEGORY_ORGANIZATIONAL),
61
+ # ----- ai_chat_messages (7.2) -----
62
+ _row("ai_chat_messages", "message_id", "message_id", "identifier", CATEGORY_ORGANIZATIONAL),
63
+ _row("ai_chat_messages", "conversation_id", "conversation_id", "identifier", CATEGORY_ORGANIZATIONAL),
64
+ _row("ai_chat_messages", "sender_type", "sender_type", "text", CATEGORY_INFORMATIONAL),
65
+ _row("ai_chat_messages", "sender_id", "sender_id", "text", CATEGORY_INFORMATIONAL),
66
+ _row("ai_chat_messages", "ts", "event_at", "timestamp_utc", CATEGORY_INFORMATIONAL), # rename
67
+ _row("ai_chat_messages", "content", "content", "text", CATEGORY_INFORMATIONAL),
68
+ _row("ai_chat_messages", "content_rendered", "content_rendered", "text", CATEGORY_INFORMATIONAL),
69
+ _row("ai_chat_messages", "metadata_json", "metadata_json", "json", CATEGORY_INFORMATIONAL),
70
+ _row("ai_chat_messages", "seq", "sequence", "integer", CATEGORY_ORGANIZATIONAL), # rename
71
+ _row("ai_chat_messages", "source_id", "source_id", "text", CATEGORY_ORGANIZATIONAL),
72
+ # ----- ai_chat_conversations (7.3) -----
73
+ _row("ai_chat_conversations", "conversation_id", "conversation_id", "identifier", CATEGORY_ORGANIZATIONAL),
74
+ _row("ai_chat_conversations", "owner_user_id", "owner_user_id", "identifier", CATEGORY_ORGANIZATIONAL),
75
+ _row("ai_chat_conversations", "title", "title", "text", CATEGORY_INFORMATIONAL),
76
+ _row("ai_chat_conversations", "source", "source_id", "text", CATEGORY_ORGANIZATIONAL), # rename
77
+ _row("ai_chat_conversations", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
78
+ _row("ai_chat_conversations", "updated_at", "updated_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
79
+ # ----- browser_visits (7.4) -----
80
+ _row("browser_visits", "record_id", "record_id", "identifier", CATEGORY_ORGANIZATIONAL),
81
+ _row("browser_visits", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
82
+ _row("browser_visits", "url", "url", "text", CATEGORY_INFORMATIONAL),
83
+ _row("browser_visits", "visited_at", "visited_at", "timestamp_utc", CATEGORY_INFORMATIONAL),
84
+ _row("browser_visits", "title", "title", "text", CATEGORY_INFORMATIONAL),
85
+ _row("browser_visits", "favicon_url", "favicon_url", "text", CATEGORY_INFORMATIONAL),
86
+ _row("browser_visits", "hostname", "hostname", "text", CATEGORY_INFORMATIONAL),
87
+ _row("browser_visits", "device_name", "device_name", "text", CATEGORY_INFORMATIONAL),
88
+ _row("browser_visits", "tab_id", "tab_id", "integer", CATEGORY_ORGANIZATIONAL),
89
+ _row("browser_visits", "window_id", "window_id", "integer", CATEGORY_ORGANIZATIONAL),
90
+ _row("browser_visits", "incognito", "incognito", "integer", CATEGORY_ORGANIZATIONAL),
91
+ _row("browser_visits", "transition_type", "transition_type", "text", CATEGORY_INFORMATIONAL),
92
+ _row("browser_visits", "pinned", "pinned", "integer", CATEGORY_ORGANIZATIONAL),
93
+ _row("browser_visits", "audible", "audible", "integer", CATEGORY_ORGANIZATIONAL),
94
+ _row("browser_visits", "muted", "muted", "integer", CATEGORY_ORGANIZATIONAL),
95
+ _row("browser_visits", "opener_tab_id", "opener_tab_id", "integer", CATEGORY_ORGANIZATIONAL),
96
+ _row("browser_visits", "referred_by", "referred_by", "text", CATEGORY_INFORMATIONAL),
97
+ _row("browser_visits", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
98
+ # ----- browser_events (7.5) -----
99
+ _row("browser_events", "record_id", "record_id", "identifier", CATEGORY_ORGANIZATIONAL),
100
+ _row("browser_events", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
101
+ _row("browser_events", "event_type", "event_type", "text", CATEGORY_INFORMATIONAL),
102
+ _row("browser_events", "url", "url", "text", CATEGORY_INFORMATIONAL),
103
+ _row("browser_events", "visited_at", "visited_at", "timestamp_utc", CATEGORY_INFORMATIONAL),
104
+ _row("browser_events", "title", "title", "text", CATEGORY_INFORMATIONAL),
105
+ _row("browser_events", "favicon_url", "favicon_url", "text", CATEGORY_INFORMATIONAL),
106
+ _row("browser_events", "hostname", "hostname", "text", CATEGORY_INFORMATIONAL),
107
+ _row("browser_events", "device_name", "device_name", "text", CATEGORY_INFORMATIONAL),
108
+ _row("browser_events", "transition_type", "transition_type", "text", CATEGORY_INFORMATIONAL),
109
+ _row("browser_events", "content", "content", "text", CATEGORY_INFORMATIONAL),
110
+ _row("browser_events", "tab_id", "tab_id", "integer", CATEGORY_ORGANIZATIONAL),
111
+ _row("browser_events", "window_id", "window_id", "integer", CATEGORY_ORGANIZATIONAL),
112
+ _row("browser_events", "incognito", "incognito", "integer", CATEGORY_ORGANIZATIONAL),
113
+ _row("browser_events", "pinned", "pinned", "integer", CATEGORY_ORGANIZATIONAL),
114
+ _row("browser_events", "audible", "audible", "integer", CATEGORY_ORGANIZATIONAL),
115
+ _row("browser_events", "muted", "muted", "integer", CATEGORY_ORGANIZATIONAL),
116
+ _row("browser_events", "opener_tab_id", "opener_tab_id", "integer", CATEGORY_ORGANIZATIONAL),
117
+ _row("browser_events", "starred_at", "starred_at", "timestamp_utc", CATEGORY_INFORMATIONAL),
118
+ _row("browser_events", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
119
+ # ----- browser_url_classification (7.6) -----
120
+ _row("browser_url_classification", "source_table", "enriched_from_table", "identifier", CATEGORY_ORGANIZATIONAL), # rename
121
+ _row("browser_url_classification", "record_id", "record_id", "identifier", CATEGORY_ORGANIZATIONAL),
122
+ _row("browser_url_classification", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
123
+ _row("browser_url_classification", "url", "url", "text", CATEGORY_INFORMATIONAL),
124
+ _row("browser_url_classification", "title", "title", "text", CATEGORY_INFORMATIONAL),
125
+ _row("browser_url_classification", "url_category", "url_category", "text", CATEGORY_INFORMATIONAL),
126
+ _row("browser_url_classification", "url_confidence", "url_confidence", "real", CATEGORY_INFORMATIONAL),
127
+ _row("browser_url_classification", "model_name", "model_name", "text", CATEGORY_ORGANIZATIONAL),
128
+ _row("browser_url_classification", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
129
+ _row("browser_url_classification", "updated_at", "updated_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
130
+ # ----- message_emotions (7.7) -----
131
+ _row("message_emotions", "message_id", "message_id", "identifier", CATEGORY_ORGANIZATIONAL),
132
+ _row("message_emotions", "source_id", "source_id", "text", CATEGORY_ORGANIZATIONAL),
133
+ _row("message_emotions", "emotion_label", "emotion_label", "text", CATEGORY_INFORMATIONAL),
134
+ _row("message_emotions", "confidence", "confidence", "real", CATEGORY_INFORMATIONAL),
135
+ _row("message_emotions", "model", "model_name", "text", CATEGORY_ORGANIZATIONAL), # rename
136
+ _row("message_emotions", "all_emotions", "all_emotions_json", "json", CATEGORY_INFORMATIONAL), # rename
137
+ _row("message_emotions", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
138
+ # ----- contacts (Stage 11: contacts:resolve — canonical messenger address book) -----
139
+ # See topos/storage/canonical/conversations_tables.py CREATE TABLE contacts / contact_identifiers
140
+ _row("contacts", "contact_id", "contact_id", "identifier", CATEGORY_ORGANIZATIONAL),
141
+ _row("contacts", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
142
+ _row("contacts", "source_id", "source_id", "text", CATEGORY_ORGANIZATIONAL),
143
+ _row("contacts", "display_name", "display_name", "text", CATEGORY_INFORMATIONAL),
144
+ _row("contacts", "known_usernames_json", "known_usernames_json", "json", CATEGORY_INFORMATIONAL),
145
+ _row("contacts", "is_self", "is_self", "integer", CATEGORY_ORGANIZATIONAL),
146
+ _row("contacts", "last_import_source", "last_import_source", "text", CATEGORY_ORGANIZATIONAL),
147
+ _row("contacts", "last_import_run_id", "last_import_run_id", "identifier", CATEGORY_ORGANIZATIONAL),
148
+ _row("contacts", "last_imported_at", "last_imported_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
149
+ _row("contacts", "sharing_policy_json", "sharing_policy_json", "json", CATEGORY_INFORMATIONAL),
150
+ _row("contacts", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
151
+ _row("contacts", "updated_at", "updated_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
152
+ _row("contact_identifiers", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
153
+ _row("contact_identifiers", "source_id", "source_id", "text", CATEGORY_ORGANIZATIONAL),
154
+ _row("contact_identifiers", "identifier", "identifier", "text", CATEGORY_INFORMATIONAL),
155
+ _row("contact_identifiers", "identifier_type", "identifier_type", "text", CATEGORY_INFORMATIONAL),
156
+ _row("contact_identifiers", "contact_id", "contact_id", "identifier", CATEGORY_ORGANIZATIONAL),
157
+ _row("contact_identifiers", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
158
+ _row("contact_identifiers", "updated_at", "updated_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
159
+ ]
160
+
161
+ # Tables covered by this registry (for validation and iteration).
162
+ STAGE_9_TABLE_NAMES = [
163
+ "conversation_messages",
164
+ "ai_chat_messages",
165
+ "ai_chat_conversations",
166
+ "browser_visits",
167
+ "browser_events",
168
+ "browser_url_classification",
169
+ "message_emotions",
170
+ "contacts",
171
+ "contact_identifiers",
172
+ ]
173
+
174
+
175
def get_columns_for_table(
    table_name: str,
    *,
    include_organizational: bool = True,
    use_canonical_names: bool = False,
) -> List[Dict[str, Any]]:
    """
    List the registry entries of one table, each copied with a ``column_name`` key added.

    include_organizational: when False, entries whose category is not informational are dropped.
    use_canonical_names: when True, ``column_name`` holds canonical_column_name;
        otherwise it holds current_column_name (the name in the DB today).
    """
    # Decide once which registry field feeds the extra "column_name" key.
    name_key = "canonical_column_name" if use_canonical_names else "current_column_name"
    selected: List[Dict[str, Any]] = []
    for entry in SCHEMA_REGISTRY:
        if entry["table_name"] != table_name:
            continue
        if not include_organizational and entry["category"] != CATEGORY_INFORMATIONAL:
            continue
        selected.append({**entry, "column_name": entry[name_key]})
    return selected
194
+
195
+
196
def get_informational_columns(table_name: str) -> List[Dict[str, Any]]:
    """List a table's informational columns, i.e. the ones filterable by default."""
    informational_only = get_columns_for_table(
        table_name,
        include_organizational=False,
    )
    return informational_only
199
+
200
+
201
def get_rename_targets() -> List[Dict[str, Any]]:
    """Return all columns that are Stage 9 migration rename targets (current != canonical).

    Returns:
        A new list of shallow row copies, so callers can modify the result
        without altering the module-level SCHEMA_REGISTRY.
    """
    # Copy each row: handing out the registry's own dicts would let a caller
    # mutate shared module state by accident.
    return [dict(row) for row in SCHEMA_REGISTRY if row["rename_target"]]
204
+
205
+
206
def get_registry_as_list(
    *,
    include_organizational: bool = True,
) -> List[Dict[str, Any]]:
    """Return registry rows as a list of dicts (e.g. for JSON export or API).

    Args:
        include_organizational: When False, only rows whose category is
            informational are returned.

    Returns:
        A new list of shallow row copies, so callers can modify the result
        (e.g. add response-only keys) without altering the module-level
        SCHEMA_REGISTRY.
    """
    # Copy each row, not just the list: the row dicts are shared module state.
    return [
        dict(row)
        for row in SCHEMA_REGISTRY
        if include_organizational or row["category"] == CATEGORY_INFORMATIONAL
    ]
214
+
215
+
216
def get_table_names() -> List[str]:
    """Return a fresh list of the table names declared in STAGE_9_TABLE_NAMES."""
    return [*STAGE_9_TABLE_NAMES]
219
+
220
+
221
def resolve_column_to_canonical(table_name: str, current_column_name: str) -> Optional[str]:
    """
    Look up the canonical name of a column identified by its current (DB) name.

    Returns the canonical name of the first registry entry matching the
    (table, current column) pair; for columns that are not rename targets this
    equals the current name. Returns None when the pair is not in the registry.
    """
    matches = (
        entry["canonical_column_name"]
        for entry in SCHEMA_REGISTRY
        if entry["table_name"] == table_name
        and entry["current_column_name"] == current_column_name
    )
    # First hit wins; None when the generator is empty.
    return next(matches, None)
topos/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Topos control plane package root."""
2
+
3
+ from .__version__ import __version__, __version_info__
4
+
5
+ __all__ = ["__version__", "__version_info__"]
topos/__version__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Topos Control Plane Version Information."""
2
+
3
+ __version__ = "0.0.0"
4
+ __version_info__ = (0, 0, 0)
5
+
6
+ __all__ = ["__version__", "__version_info__"]
@@ -0,0 +1,15 @@
1
+ """Analytics layer for Topos."""
2
+
3
+ from .messenger_communities import (
4
+ compute_and_persist_messenger_analytics,
5
+ compute_importance_and_communities,
6
+ ensure_messenger_analytics_tables,
7
+ )
8
+ from .messenger_graph import extract_messenger_graph
9
+
10
+ __all__ = [
11
+ "extract_messenger_graph",
12
+ "compute_importance_and_communities",
13
+ "compute_and_persist_messenger_analytics",
14
+ "ensure_messenger_analytics_tables",
15
+ ]
@@ -0,0 +1,48 @@
1
+ """DuckDB adapter for analytics queries."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ try:
10
+ import duckdb
11
+ except ImportError:
12
+ duckdb = None # type: ignore
13
+
14
+ logger = logging.getLogger("topos.analytics.duckdb")
15
+
16
+
17
class DuckDBAdapter:
    """Thin wrapper around a DuckDB connection that returns rows as dicts.

    Can be used as a context manager; the connection is closed on exit.
    """

    def __init__(self, db_path: Optional[Path] = None):
        """Open a DuckDB connection.

        Args:
            db_path: Database file to open; an in-memory database is used
                when omitted.

        Raises:
            ImportError: If the optional ``duckdb`` package is not installed.
        """
        if duckdb is None:
            raise ImportError("duckdb package not installed")
        self.conn = duckdb.connect(str(db_path) if db_path else ":memory:")

    def __enter__(self) -> "DuckDBAdapter":
        return self

    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
        self.close()

    def attach_sqlite(self, sqlite_path: str) -> None:
        """Attach a SQLite database file under the alias ``projection``."""
        # Double any single quote so the path stays inside the SQL string literal.
        escaped_path = sqlite_path.replace("'", "''")
        self.conn.execute(f"ATTACH '{escaped_path}' AS projection (TYPE SQLITE)")

    def query_jsonl_file(
        self,
        file_path: str,
        dataset_id: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """Read a newline-delimited JSON file and return its rows as dicts.

        Args:
            file_path: Path handed to DuckDB's ``read_ndjson``.
            dataset_id: When non-empty, only rows whose ``dataset_id`` column
                equals this value are returned.

        Returns:
            One dict per row, keyed by column name.
        """
        escaped_path = file_path.replace("'", "''")
        query = f"SELECT * FROM read_ndjson('{escaped_path}')"
        params: List[Any] = []
        if dataset_id:
            # Bind the value instead of interpolating it: a dataset_id that
            # contains a quote must not be able to alter the SQL statement.
            query += " WHERE dataset_id = ?"
            params.append(dataset_id)
        return self.execute(query, params)

    def execute(self, query: str, params: Optional[List[Any]] = None) -> List[Dict[str, Any]]:
        """Run a query with optional positional parameters.

        Returns:
            One dict per result row, keyed by column name (empty dicts when
            the connection reports no result description).
        """
        params = params or []
        result = self.conn.execute(query, params).fetchall()
        columns = [desc[0] for desc in self.conn.description] if self.conn.description else []
        return [dict(zip(columns, row)) for row in result]

    def close(self) -> None:
        """Close the underlying connection if there is one."""
        if self.conn:
            self.conn.close()
@@ -0,0 +1,349 @@
1
+ """Messenger graph importance and community detection (Sprint 02)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from datetime import datetime, timezone
7
+ from typing import Any, Dict, List, Optional, Sequence
8
+
9
+ import networkx as nx
10
+
11
+ from .messenger_graph import extract_messenger_graph
12
+
13
+ MESSENGER_SOCIAL_EDGES_TABLE = "messenger_social_edges"
14
+ MESSENGER_PARTICIPANT_IMPORTANCE_TABLE = "messenger_participant_importance"
15
+ MESSENGER_COMMUNITIES_TABLE = "messenger_communities"
16
+
17
+
18
def _utc_now() -> str:
    """Return the current time in UTC as an ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
20
+
21
+
22
def _source_scope(source_ids: Optional[Sequence[str]]) -> str:
    """Collapse a list of source ids into one canonical scope string.

    Ids are stringified, stripped, de-duplicated and sorted, then joined with
    commas. ``"all"`` is returned when nothing usable remains.
    """
    cleaned = set()
    for raw in source_ids or ():
        text = str(raw).strip()
        if text:
            cleaned.add(text)
    if not cleaned:
        return "all"
    return ",".join(sorted(cleaned))
27
+
28
+
29
def ensure_messenger_analytics_tables(conn: Any) -> None:
    """Create the Sprint 02 derived messenger analytics tables if missing.

    For each of the social-edges, participant-importance and communities
    tables, a ``CREATE TABLE IF NOT EXISTS`` is issued, followed by an index
    on ``(dataset_id, period_key, source_scope)``. The connection is
    committed once at the end.
    """
    # Statements are listed in execution order: each table, then its index.
    ddl_statements = (
        f"""
        CREATE TABLE IF NOT EXISTS {MESSENGER_SOCIAL_EDGES_TABLE} (
            dataset_id TEXT NOT NULL,
            period_key TEXT NOT NULL,
            source_scope TEXT NOT NULL DEFAULT 'all',
            source_id TEXT NOT NULL,
            target_id TEXT NOT NULL,
            weight REAL NOT NULL,
            edge_type TEXT,
            edge_type_counts_json TEXT,
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            PRIMARY KEY (dataset_id, period_key, source_scope, source_id, target_id)
        )
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{MESSENGER_SOCIAL_EDGES_TABLE}_dataset_period
        ON {MESSENGER_SOCIAL_EDGES_TABLE}(dataset_id, period_key, source_scope)
        """,
        f"""
        CREATE TABLE IF NOT EXISTS {MESSENGER_PARTICIPANT_IMPORTANCE_TABLE} (
            dataset_id TEXT NOT NULL,
            period_key TEXT NOT NULL,
            source_scope TEXT NOT NULL DEFAULT 'all',
            participant_id TEXT NOT NULL,
            centrality_degree REAL NOT NULL,
            centrality_betweenness REAL NOT NULL,
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            PRIMARY KEY (dataset_id, period_key, source_scope, participant_id)
        )
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{MESSENGER_PARTICIPANT_IMPORTANCE_TABLE}_dataset_period
        ON {MESSENGER_PARTICIPANT_IMPORTANCE_TABLE}(dataset_id, period_key, source_scope)
        """,
        f"""
        CREATE TABLE IF NOT EXISTS {MESSENGER_COMMUNITIES_TABLE} (
            dataset_id TEXT NOT NULL,
            period_key TEXT NOT NULL,
            source_scope TEXT NOT NULL DEFAULT 'all',
            participant_id TEXT NOT NULL,
            community_id INTEGER NOT NULL,
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            PRIMARY KEY (dataset_id, period_key, source_scope, participant_id)
        )
        """,
        f"""
        CREATE INDEX IF NOT EXISTS idx_{MESSENGER_COMMUNITIES_TABLE}_dataset_period
        ON {MESSENGER_COMMUNITIES_TABLE}(dataset_id, period_key, source_scope)
        """,
    )
    for statement in ddl_statements:
        conn.execute(statement)
    conn.commit()
98
+
99
+
100
def build_networkx_graph(period_payload: Dict[str, Any]) -> nx.Graph:
    """Turn one Sprint 01 period payload into an undirected weighted graph.

    Nodes come from ``period_payload["nodes"]`` and keep every key of their
    payload dict as node attributes. Edges come from ``period_payload["edges"]``.
    Entries with a blank id, a blank endpoint, or identical endpoints are
    skipped. Edge weights are clamped so they are never negative.
    """

    def _clean(value: Any) -> str:
        # Missing or falsy identifiers collapse to the empty string.
        return str(value or "").strip()

    result = nx.Graph()

    for node_attrs in period_payload.get("nodes", []):
        identifier = _clean(node_attrs.get("id"))
        if identifier:
            result.add_node(identifier, **node_attrs)

    for link in period_payload.get("edges", []):
        src = _clean(link.get("source"))
        dst = _clean(link.get("target"))
        if src and dst and src != dst:
            raw_weight = float(link.get("weight") or 0.0)
            result.add_edge(src, dst, weight=max(raw_weight, 0.0))

    return result
117
+
118
+
119
def compute_importance_and_communities(period_payload: Dict[str, Any]) -> Dict[str, Any]:
    """Compute centrality metrics and community assignments for one period.

    Args:
        period_payload: Sprint 01 period payload, passed to
            ``build_networkx_graph``.

    Returns:
        A dict with three keys:
        ``"importance"`` maps node id to ``centrality_degree`` and
        ``centrality_betweenness`` floats, ``"communities"`` maps node id to
        an integer community id, and ``"graph"`` is the graph that was built.
        Both mappings are empty when the graph has no nodes.
    """
    # Imported locally because this module does not import logging at file level.
    import logging

    log = logging.getLogger(__name__)

    graph = build_networkx_graph(period_payload)
    node_ids = list(graph.nodes())
    if not node_ids:
        return {"importance": {}, "communities": {}, "graph": graph}

    degree = nx.degree_centrality(graph)
    # Use unweighted betweenness to keep metric stable with strength-based edges.
    betweenness = nx.betweenness_centrality(graph, weight=None, normalized=True)

    if graph.number_of_edges() > 0:
        communities = None
        try:
            from community import community_louvain  # type: ignore

            communities = community_louvain.best_partition(graph, weight="weight", random_state=42)
        except ImportError:
            # python-louvain is optional; a missing package is expected.
            log.debug("python-louvain unavailable; using connected components as communities")
        except Exception:
            # Keep the pipeline running, but record why Louvain failed
            # instead of hiding the error.
            log.warning(
                "Louvain community detection failed; using connected components as communities",
                exc_info=True,
            )
        if communities is None:
            # Fallback: every connected component becomes one community.
            communities = {
                node_id: idx
                for idx, component in enumerate(nx.connected_components(graph))
                for node_id in component
            }
    else:
        # Without edges every node is its own community, numbered in sorted order.
        communities = {node_id: idx for idx, node_id in enumerate(sorted(node_ids))}

    importance: Dict[str, Dict[str, float]] = {
        node_id: {
            "centrality_degree": float(degree.get(node_id, 0.0)),
            "centrality_betweenness": float(betweenness.get(node_id, 0.0)),
        }
        for node_id in node_ids
    }
    return {
        "importance": importance,
        "communities": {k: int(v) for k, v in communities.items()},
        "graph": graph,
    }
155
+
156
+
157
def _persist_period_results(
    conn: Any,
    *,
    dataset_id: str,
    period_key: str,
    source_scope: str,
    period_payload: Dict[str, Any],
    importance: Dict[str, Dict[str, float]],
    communities: Dict[str, int],
) -> Dict[str, int]:
    """Replace the stored analytics rows for one dataset/period/scope.

    Existing rows for ``(dataset_id, period_key, source_scope)`` are deleted
    from the edges, importance and communities tables, the new rows are
    inserted, and the connection is committed. If any statement fails, the
    connection is rolled back (when it offers ``rollback``) and the error is
    re-raised, so a failed run does not leave the deletes or partial inserts
    pending.

    Args:
        conn: DB-API style connection using ``?`` placeholders.
        dataset_id: Dataset the rows belong to.
        period_key: Period the rows belong to.
        source_scope: Scope string stored with every row.
        period_payload: Period payload; its ``"edges"`` list is persisted.
            Edges with a blank source or target are skipped.
        importance: Node id mapped to its centrality metrics.
        communities: Node id mapped to its community id.

    Returns:
        Counts of rows written, under ``edges_written``,
        ``importance_written`` and ``communities_written``.
    """
    now = _utc_now()
    scope_key = (dataset_id, period_key, source_scope)

    # Build every row before touching the database, so a bad value in the
    # payload fails before anything has been deleted.
    edge_rows = []
    for edge in period_payload.get("edges", []):
        source_id = str(edge.get("source") or "").strip()
        target_id = str(edge.get("target") or "").strip()
        if not source_id or not target_id:
            continue
        edge_rows.append(
            (
                *scope_key,
                source_id,
                target_id,
                float(edge.get("weight") or 0.0),
                str(edge.get("edge_type") or ""),
                json.dumps(edge.get("edge_type_counts") or {}, ensure_ascii=False),
                now,
                now,
            )
        )

    importance_rows = [
        (
            *scope_key,
            participant_id,
            float(metrics.get("centrality_degree", 0.0)),
            float(metrics.get("centrality_betweenness", 0.0)),
            now,
            now,
        )
        for participant_id, metrics in importance.items()
    ]

    community_rows = [
        (*scope_key, participant_id, int(community_id), now, now)
        for participant_id, community_id in communities.items()
    ]

    try:
        for table in (
            MESSENGER_SOCIAL_EDGES_TABLE,
            MESSENGER_PARTICIPANT_IMPORTANCE_TABLE,
            MESSENGER_COMMUNITIES_TABLE,
        ):
            conn.execute(
                f"""
                DELETE FROM {table}
                WHERE dataset_id = ? AND period_key = ? AND source_scope = ?
                """,
                scope_key,
            )

        if edge_rows:
            conn.executemany(
                f"""
                INSERT INTO {MESSENGER_SOCIAL_EDGES_TABLE}
                (
                    dataset_id, period_key, source_scope, source_id, target_id,
                    weight, edge_type, edge_type_counts_json, created_at, updated_at
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                edge_rows,
            )
        if importance_rows:
            conn.executemany(
                f"""
                INSERT INTO {MESSENGER_PARTICIPANT_IMPORTANCE_TABLE}
                (
                    dataset_id, period_key, source_scope, participant_id,
                    centrality_degree, centrality_betweenness, created_at, updated_at
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """,
                importance_rows,
            )
        if community_rows:
            conn.executemany(
                f"""
                INSERT INTO {MESSENGER_COMMUNITIES_TABLE}
                (
                    dataset_id, period_key, source_scope, participant_id,
                    community_id, created_at, updated_at
                )
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                community_rows,
            )
        conn.commit()
    except Exception:
        # Undo the deletes and partial inserts so a later commit on this
        # connection cannot persist a half-replaced period.
        rollback = getattr(conn, "rollback", None)
        if callable(rollback):
            rollback()
        raise

    return {
        "edges_written": len(edge_rows),
        "importance_written": len(importance_rows),
        "communities_written": len(community_rows),
    }
281
+
282
+
283
def compute_and_persist_messenger_analytics(
    *,
    dataset_id: str,
    conn: Optional[Any] = None,
    start_ts: Optional[str] = None,
    end_ts: Optional[str] = None,
    source_ids: Optional[Sequence[str]] = None,
    period_granularity: str = "month",
    cumulative: bool = False,
) -> Dict[str, Any]:
    """Extract the messenger graph, compute metrics per period, and store them.

    Uses ``conn`` when given, otherwise the connection returned by
    ``get_db_connection``. Periods without a ``period_key`` are skipped.

    Returns:
        A summary dict with ``dataset_id``, ``period_granularity``,
        ``source_scope``, a ``periods`` list (write counts plus node and edge
        counts per period) and the summed ``totals``.

    Raises:
        RuntimeError: If no database connection is available.
    """
    db = conn
    if db is None:
        from ..core.state import get_db_connection

        db = get_db_connection()
    if db is None:
        raise RuntimeError("Database connection not available")

    ensure_messenger_analytics_tables(db)
    extraction = extract_messenger_graph(
        dataset_id=dataset_id,
        conn=db,
        start_ts=start_ts,
        end_ts=end_ts,
        source_ids=source_ids,
        period_granularity=period_granularity,
        cumulative=cumulative,
    )

    scope = _source_scope(source_ids)
    grand_totals = {"edges_written": 0, "importance_written": 0, "communities_written": 0}
    period_summaries: List[Dict[str, Any]] = []

    for payload in extraction.get("periods", []):
        key = str(payload.get("period_key") or "")
        if key:
            metrics = compute_importance_and_communities(payload)
            written = _persist_period_results(
                db,
                dataset_id=dataset_id,
                period_key=key,
                source_scope=scope,
                period_payload=payload,
                importance=metrics["importance"],
                communities=metrics["communities"],
            )
            for counter_name in grand_totals:
                grand_totals[counter_name] += written[counter_name]

            # Same key order as a literal: period_key, write counts, then sizes.
            summary: Dict[str, Any] = {"period_key": key}
            summary.update(written)
            summary["nodes_count"] = len(payload.get("nodes", []))
            summary["edges_count"] = len(payload.get("edges", []))
            period_summaries.append(summary)

    return {
        "dataset_id": dataset_id,
        "period_granularity": period_granularity,
        "source_scope": scope,
        "periods": period_summaries,
        "totals": grand_totals,
    }