topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,295 @@
1
+ """Raw tables manager for storing original payloads before canonicalization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import sqlite3
8
+ from typing import Any, Dict, Optional
9
+
10
+ logger = logging.getLogger("topos.storage.raw.raw_tables_manager")
11
+
12
+
13
class RawTablesManager:
    """Manages raw retention tables for storing original payloads.

    According to architecture, raw tables are per-connector:
    - `raw_chat_messages_{source}` for chat sources
    - `raw_{source}_events` for event sources

    Table and index names are derived from caller-supplied source ids, so
    every identifier interpolated into SQL goes through ``_quote_ident``.
    """

    # The browser-visits table stores normalized columns instead of payload_json.
    _BROWSER_VISITS_TABLE = "raw_chat_messages_browservisits"

    # Columns of the browser-visits table that are bound explicitly on insert
    # (``created_at`` is appended separately).
    _BROWSER_VISIT_COLUMNS = (
        "source_system",
        "source_record_id",
        "record_id",
        "dataset_id",
        "url",
        "visited_at",
        "title",
        "favicon_url",
        "hostname",
        "device_name",
        "tab_id",
        "window_id",
        "incognito",
        "transition_type",
        "pinned",
        "audible",
        "muted",
        "opener_tab_id",
        "referred_by",
    )

    def __init__(self, conn: sqlite3.Connection):
        """Initialize with database connection."""
        self.conn = conn

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _quote_ident(name: str) -> str:
        """Return ``name`` as a double-quoted SQL identifier (embedded quotes doubled)."""
        return '"' + name.replace('"', '""') + '"'

    @staticmethod
    def _int_or_none(value: Any) -> Optional[int]:
        """Return ``value`` if it is a real integer, else None.

        ``bool`` is a subclass of ``int`` in Python, so it is rejected
        explicitly; otherwise ``True`` would be stored as id ``1``.
        """
        if isinstance(value, bool) or not isinstance(value, int):
            return None
        return value

    @staticmethod
    def _flag_or_none(value: Any) -> Optional[int]:
        """Map exactly ``True``/``False`` to 1/0; anything else becomes None."""
        if value is True:
            return 1
        if value is False:
            return 0
        return None

    @staticmethod
    def _json_flag_case(field: str) -> str:
        """Return the SQL CASE expression that normalizes a JSON boolean field to 1/0/NULL."""
        extracted = f"json_extract(payload_json, '$.{field}')"
        return (
            "CASE "
            f"WHEN {extracted} IN (1, '1', 'true', 'TRUE') THEN 1 "
            f"WHEN {extracted} IN (0, '0', 'false', 'FALSE') THEN 0 "
            "ELSE NULL END"
        )

    # ------------------------------------------------------------------
    # Naming
    # ------------------------------------------------------------------

    def get_raw_table_name(self, source_id: str, source_type: str = "chat_messages") -> str:
        """Get raw table name for a source.

        Args:
            source_id: Source identifier (e.g., "chatgpt", "chatgpt_ui_conversation")
            source_type: Type of data ("chat_messages", "events", etc.)

        Returns:
            Table name like "raw_chat_messages_chatgpt"
        """
        if source_id in ("browser_visits", "browser_events", "starred_websites"):
            # Browser sources keep their full name, just without underscores.
            base_source = source_id.replace("_", "")
        elif "_" not in source_id:
            base_source = source_id
        else:
            # Prefer an exact underscore-separated segment naming a known source,
            # e.g. "chatgpt_ui_conversation" -> "chatgpt".
            known = ("chatgpt", "grok", "claude", "gemini")
            matched = next((part for part in source_id.split("_") if part in known), None)
            lowered = source_id.lower()
            if matched is not None:
                base_source = matched
            elif "chatgpt" in lowered:
                base_source = "chatgpt"
            elif "grok" in lowered:
                base_source = "grok"
            else:
                # Fallback: drop the "dev_test_" prefix and all underscores.
                base_source = source_id.replace("dev_test_", "").replace("_", "")

        if source_type == "chat_messages":
            return f"raw_chat_messages_{base_source}"
        return f"raw_{base_source}_{source_type}"

    # ------------------------------------------------------------------
    # Schema management
    # ------------------------------------------------------------------

    def _create_indexes(self, table_name: str, columns) -> None:
        """Create ``idx_{table_name}_{column}`` on each of ``columns`` if missing."""
        table = self._quote_ident(table_name)
        for column in columns:
            index = self._quote_ident(f"idx_{table_name}_{column}")
            self.conn.execute(f"CREATE INDEX IF NOT EXISTS {index} ON {table}({column})")

    def ensure_raw_table(self, table_name: str) -> None:
        """Ensure raw table exists with proper schema.

        Raw tables store original payloads verbatim with:
        - source_system: Source identifier
        - source_record_id: Unique record ID within source
        - payload_json: Original payload as JSON string
        - created_at: Timestamp when record was stored
        - Uniqueness: (source_system, source_record_id)

        The browser-visits table is delegated to
        ``_ensure_browser_visits_raw_table`` because it uses normalized columns.

        Raises:
            Exception: Any database error is logged, the transaction is rolled
                back, and the error is re-raised.
        """
        try:
            if table_name == self._BROWSER_VISITS_TABLE:
                self._ensure_browser_visits_raw_table(table_name)
                return

            self.conn.execute(f"""
                CREATE TABLE IF NOT EXISTS {self._quote_ident(table_name)} (
                    source_system TEXT NOT NULL,
                    source_record_id TEXT NOT NULL,
                    payload_json TEXT NOT NULL,
                    created_at TEXT NOT NULL DEFAULT (datetime('now')),
                    PRIMARY KEY (source_system, source_record_id)
                )
            """)
            self._create_indexes(table_name, ("source_system", "created_at"))
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            logger.error("Failed to ensure raw table %s: %s", table_name, e)
            raise

    def _create_browser_visits_schema(self, target_name: str) -> None:
        """Create the normalized browser-visits table under ``target_name`` if missing."""
        self.conn.execute(f"""
            CREATE TABLE IF NOT EXISTS {self._quote_ident(target_name)} (
                source_system TEXT NOT NULL,
                source_record_id TEXT NOT NULL,
                record_id TEXT,
                dataset_id TEXT,
                url TEXT,
                visited_at TEXT,
                title TEXT,
                favicon_url TEXT,
                hostname TEXT,
                device_name TEXT,
                tab_id INTEGER,
                window_id INTEGER,
                incognito INTEGER,
                transition_type TEXT,
                pinned INTEGER,
                audible INTEGER,
                muted INTEGER,
                opener_tab_id INTEGER,
                referred_by TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                PRIMARY KEY (source_system, source_record_id)
            )
        """)

    def _migrate_browser_visits_table(self, table_name: str) -> None:
        """Rebuild a legacy payload_json browser-visits table with normalized columns.

        Rows are copied into ``{table_name}__migrated`` with fields extracted
        from ``payload_json``; the old table is then dropped and the new one
        renamed into place. Row counts before and after are logged.
        """
        table = self._quote_ident(table_name)
        tmp_name = f"{table_name}__migrated"
        tmp_table = self._quote_ident(tmp_name)

        pre_count = self.conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
        # Clear any leftover from a previously interrupted migration.
        self.conn.execute(f"DROP TABLE IF EXISTS {tmp_table}")
        self._create_browser_visits_schema(tmp_name)

        columns = ", ".join(self._BROWSER_VISIT_COLUMNS)
        flag = self._json_flag_case
        self.conn.execute(f"""
            INSERT OR REPLACE INTO {tmp_table} (
                {columns}, created_at
            )
            SELECT
                source_system,
                source_record_id,
                COALESCE(json_extract(payload_json, '$.record_id'), source_record_id),
                json_extract(payload_json, '$.dataset_id'),
                json_extract(payload_json, '$.url'),
                json_extract(payload_json, '$.visited_at'),
                json_extract(payload_json, '$.title'),
                json_extract(payload_json, '$.favicon_url'),
                json_extract(payload_json, '$.hostname'),
                json_extract(payload_json, '$.device_name'),
                CAST(json_extract(payload_json, '$.tab_id') AS INTEGER),
                CAST(json_extract(payload_json, '$.window_id') AS INTEGER),
                {flag("incognito")},
                json_extract(payload_json, '$.transition_type'),
                {flag("pinned")},
                {flag("audible")},
                {flag("muted")},
                CAST(json_extract(payload_json, '$.opener_tab_id') AS INTEGER),
                json_extract(payload_json, '$.referred_by'),
                COALESCE(created_at, datetime('now'))
            FROM {table}
        """)
        self.conn.execute(f"DROP TABLE {table}")
        self.conn.execute(f"ALTER TABLE {tmp_table} RENAME TO {table}")

        post_count = self.conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
        logger.info(
            "[PIPELINE:RAW] Migrated %s to normalized schema: rows_before=%d rows_after=%d",
            table_name,
            pre_count,
            post_count,
        )

    def _ensure_browser_visits_raw_table(self, table_name: str) -> None:
        """Ensure browser visits raw table uses normalized columns (no payload_json).

        Creates the table when absent. When it exists with a ``payload_json``
        column or without a ``url`` column, it is migrated in place. Indexes
        are (re)created afterwards and the work is committed.
        """
        table_exists = (
            self.conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
                (table_name,),
            ).fetchone()
            is not None
        )

        if not table_exists:
            self._create_browser_visits_schema(table_name)
        else:
            info_rows = self.conn.execute(
                f"PRAGMA table_info({self._quote_ident(table_name)})"
            ).fetchall()
            # Column name is the second field of each table_info row.
            existing_cols = {row[1] for row in info_rows}
            if "payload_json" in existing_cols or "url" not in existing_cols:
                self._migrate_browser_visits_table(table_name)

        self._create_indexes(table_name, ("source_system", "created_at", "visited_at", "url"))
        self.conn.commit()

    # ------------------------------------------------------------------
    # Writes
    # ------------------------------------------------------------------

    def _browser_visit_row(self, source_id: str, source_record_id: str, payload: Dict[str, Any]) -> tuple:
        """Build the parameter tuple for a browser-visits insert, in column order."""
        get = payload.get
        return (
            source_id,
            source_record_id,
            get("record_id") or source_record_id,
            get("dataset_id"),
            get("url"),
            get("visited_at"),
            get("title"),
            get("favicon_url"),
            get("hostname"),
            get("device_name"),
            self._int_or_none(get("tab_id")),
            self._int_or_none(get("window_id")),
            self._flag_or_none(get("incognito")),
            get("transition_type"),
            self._flag_or_none(get("pinned")),
            self._flag_or_none(get("audible")),
            self._flag_or_none(get("muted")),
            self._int_or_none(get("opener_tab_id")),
            get("referred_by"),
        )

    def write_raw_record(
        self,
        source_id: str,
        source_record_id: str,
        payload: Dict[str, Any],
        source_type: str = "chat_messages",
    ) -> None:
        """Write raw record to raw table.

        The target table is created on demand. An existing row with the same
        (source_system, source_record_id) is replaced.

        Args:
            source_id: Source identifier
            source_record_id: Unique record ID within source
            payload: Original payload dictionary
            source_type: Type of data ("chat_messages", "events", etc.)

        Raises:
            Exception: Any failure is logged, the transaction is rolled back,
                and the error is re-raised.
        """
        table_name = self.get_raw_table_name(source_id, source_type)
        self.ensure_raw_table(table_name)
        table = self._quote_ident(table_name)

        try:
            if table_name == self._BROWSER_VISITS_TABLE:
                columns = ", ".join(self._BROWSER_VISIT_COLUMNS)
                placeholders = ", ".join("?" for _ in self._BROWSER_VISIT_COLUMNS)
                self.conn.execute(
                    f"INSERT OR REPLACE INTO {table} ({columns}, created_at) "
                    f"VALUES ({placeholders}, datetime('now'))",
                    self._browser_visit_row(source_id, source_record_id, payload),
                )
                self.conn.commit()
                return

            # Store payload as JSON string
            payload_json = json.dumps(payload, ensure_ascii=False)
            self.conn.execute(
                f"INSERT OR REPLACE INTO {table} "
                "(source_system, source_record_id, payload_json, created_at) "
                "VALUES (?, ?, ?, datetime('now'))",
                (source_id, source_record_id, payload_json),
            )
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            logger.error(
                "[PIPELINE:RAW] Failed to store raw record: source=%s, record_id=%s, error=%s",
                source_id,
                source_record_id,
                e,
            )
            raise
@@ -0,0 +1,17 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict
4
+
5
+ from .raw_store import RawRecordRef, RawStore
6
+
7
+
8
class SQLiteRawStore(RawStore):
    """SQLite-backed ``RawStore`` placeholder; no write path is implemented yet."""

    def __init__(self, db_conn):
        """Keep a reference to the database connection for later use."""
        self.db_conn = db_conn

    def write_file(self, file):  # pragma: no cover - not used for sqlite raw store
        """Not supported by this store; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def write_record(self, record: Dict[str, str]) -> RawRecordRef:
        """Not implemented yet; always raises ``NotImplementedError``."""
        del record  # accepted only to satisfy the method signature
        raise NotImplementedError("SQLiteRawStore not implemented yet")
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+
4
class EncryptionManager:
    """No-op encryption manager placeholder.

    Both ``encrypt_str`` and ``decrypt_str`` return their input unchanged;
    the reported crypto version is fixed at 1.
    """

    def __init__(self, dataset_id: str, user_id: str):
        """Record the dataset and user this manager is scoped to."""
        self.dataset_id = dataset_id
        self.user_id = user_id
        self._crypto_version = 1

    def get_crypto_version(self) -> int:
        """Return the crypto version stored on this instance."""
        return self._crypto_version

    def encrypt_str(self, payload: str, crypto_version: int | None = None) -> str:
        """Return ``payload`` unchanged; ``crypto_version`` is ignored."""
        del crypto_version  # placeholder: no versioned behavior yet
        return payload

    def decrypt_str(self, ciphertext: str, crypto_version: int | None = None) -> str:
        """Return ``ciphertext`` unchanged; ``crypto_version`` is ignored."""
        del crypto_version  # placeholder: no versioned behavior yet
        return ciphertext
@@ -0,0 +1,71 @@
1
+ """Signal identity storage: my_phone_number (and optional my_signal_id) per dataset.
2
+
3
+ Used to set sender_type (self vs contact) and owner when ingesting Signal messages.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from typing import Optional
10
+
11
+ logger = logging.getLogger("topos.storage.signal_identity")
12
+
13
+ TABLE = "signal_identity"
14
+
15
+
16
def ensure_table(conn) -> None:
    """Create the signal identity table when it is missing, then commit."""
    ddl = f"""
        CREATE TABLE IF NOT EXISTS {TABLE} (
            dataset_id TEXT NOT NULL PRIMARY KEY,
            my_phone_number TEXT,
            my_signal_id TEXT,
            updated_at TEXT DEFAULT (datetime('now'))
        )
    """
    conn.execute(ddl)
    conn.commit()
27
+
28
+
29
def get_signal_identity(conn, dataset_id: str) -> Optional[dict]:
    """Look up the stored Signal identity for ``dataset_id``.

    Returns a dict with keys ``my_phone_number`` and ``my_signal_id``, or
    None when an argument is falsy, no row exists, or the lookup fails
    (failures are logged as warnings, not raised).
    """
    if not (conn and dataset_id):
        return None
    query = f"SELECT my_phone_number, my_signal_id FROM {TABLE} WHERE dataset_id = ?"
    try:
        ensure_table(conn)
        record = conn.execute(query, (dataset_id,)).fetchone()
        if record:
            return {"my_phone_number": record[0], "my_signal_id": record[1]}
        return None
    except Exception as e:
        logger.warning("get_signal_identity failed: %s", e)
        return None
45
+
46
+
47
def put_signal_identity(
    conn,
    dataset_id: str,
    *,
    my_phone_number: Optional[str] = None,
    my_signal_id: Optional[str] = None,
) -> None:
    """Set Signal identity for dataset_id. Pass None to leave a field unchanged.

    The write is a single upsert: a new row is inserted, or on an existing
    row each field is replaced only when a non-None value is supplied.
    Doing the merge inside SQL avoids a separate read whose failure would
    otherwise cause stored values to be overwritten with NULL.

    Does nothing when ``conn`` or ``dataset_id`` is falsy. Database errors
    are logged as warnings and not raised.
    """
    if not conn or not dataset_id:
        return
    try:
        ensure_table(conn)
        conn.execute(
            f"""
            INSERT INTO {TABLE} (dataset_id, my_phone_number, my_signal_id, updated_at)
            VALUES (?, ?, ?, datetime('now'))
            ON CONFLICT(dataset_id) DO UPDATE SET
                my_phone_number = COALESCE(excluded.my_phone_number, my_phone_number),
                my_signal_id = COALESCE(excluded.my_signal_id, my_signal_id),
                updated_at = excluded.updated_at
            """,
            (dataset_id, my_phone_number, my_signal_id),
        )
        conn.commit()
    except Exception as e:
        logger.warning("put_signal_identity failed: %s", e)
@@ -0,0 +1,116 @@
1
+ """Per-source settings: enabled, last_sync_at, last_error (for local_sync sources)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Optional
7
+
8
+ logger = logging.getLogger("topos.storage.source_settings")
9
+
10
+ TABLE = "user_ingestion_sources"
11
+
12
+
13
def ensure_table(conn) -> None:
    """Create the per-source settings table when it is missing, then commit."""
    ddl = f"""
        CREATE TABLE IF NOT EXISTS {TABLE} (
            dataset_id TEXT NOT NULL,
            source_id TEXT NOT NULL,
            enabled INTEGER NOT NULL DEFAULT 1,
            last_sync_at TEXT,
            last_error TEXT,
            updated_at TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (dataset_id, source_id)
        )
    """
    conn.execute(ddl)
    conn.commit()
26
+
27
+
28
def get_source_settings(conn, dataset_id: str, source_id: str) -> Optional[dict]:
    """Return ``{enabled, last_sync_at, last_error}`` for one source.

    Returns None only when ``conn``, ``dataset_id`` or ``source_id`` is falsy.
    When no row exists, or the lookup fails (logged as a warning), the
    defaults are returned: enabled True, last_sync_at None, last_error None.
    """
    if not (conn and dataset_id and source_id):
        return None
    query = f"SELECT enabled, last_sync_at, last_error FROM {TABLE} WHERE dataset_id = ? AND source_id = ?"
    try:
        ensure_table(conn)
        record = conn.execute(query, (dataset_id, source_id)).fetchone()
        if record:
            return {"enabled": bool(record[0]), "last_sync_at": record[1], "last_error": record[2]}
        return {"enabled": True, "last_sync_at": None, "last_error": None}
    except Exception as e:
        logger.warning("get_source_settings failed: %s", e)
        return {"enabled": True, "last_sync_at": None, "last_error": None}
44
+
45
+
46
def put_source_settings(
    conn,
    dataset_id: str,
    source_id: str,
    *,
    enabled: Optional[bool] = None,
) -> None:
    """Update enabled; leave last_sync_at/last_error unchanged.

    Implemented as a single upsert (insert, or update ``enabled`` and
    ``updated_at`` on an existing row) rather than a separate existence
    check followed by a write.

    Does nothing when ``conn``, ``dataset_id`` or ``source_id`` is falsy, or
    when ``enabled`` is None. Database errors are logged as warnings and not
    raised.
    """
    if not conn or not dataset_id or not source_id or enabled is None:
        return
    try:
        ensure_table(conn)
        conn.execute(
            f"""
            INSERT INTO {TABLE} (dataset_id, source_id, enabled, updated_at)
            VALUES (?, ?, ?, datetime('now'))
            ON CONFLICT(dataset_id, source_id) DO UPDATE SET
                enabled = excluded.enabled,
                updated_at = excluded.updated_at
            """,
            (dataset_id, source_id, 1 if enabled else 0),
        )
        conn.commit()
    except Exception as e:
        logger.warning("put_source_settings failed: %s", e)
75
+
76
+
77
def update_sync_result(
    conn,
    dataset_id: str,
    source_id: str,
    *,
    success: bool,
    last_sync_at: Optional[str] = None,
    last_error: Optional[str] = None,
) -> None:
    """Record the outcome of a sync run for one source.

    On success, ``last_sync_at`` is stored and ``last_error`` cleared; on
    failure, only ``last_error`` is stored. A missing value is written as an
    empty string. A row is inserted (enabled = 1) when none exists yet.

    Does nothing when ``conn``, ``dataset_id`` or ``source_id`` is falsy.
    Database errors are logged as warnings and not raised.
    """
    if not (conn and dataset_id and source_id):
        return
    try:
        ensure_table(conn)
        if success:
            value = last_sync_at or ""
            statement = f"""
                INSERT INTO {TABLE} (dataset_id, source_id, enabled, last_sync_at, last_error, updated_at)
                VALUES (?, ?, 1, ?, NULL, datetime('now'))
                ON CONFLICT(dataset_id, source_id) DO UPDATE SET
                last_sync_at = ?,
                last_error = NULL,
                updated_at = datetime('now')
                """
        else:
            value = last_error or ""
            statement = f"""
                INSERT INTO {TABLE} (dataset_id, source_id, enabled, last_sync_at, last_error, updated_at)
                VALUES (?, ?, 1, NULL, ?, datetime('now'))
                ON CONFLICT(dataset_id, source_id) DO UPDATE SET
                last_error = ?,
                updated_at = datetime('now')
                """
        # The same value is bound twice: once for INSERT, once for the UPDATE branch.
        conn.execute(statement, (dataset_id, source_id, value, value))
        conn.commit()
    except Exception as e:
        logger.warning("update_sync_result failed: %s", e)
@@ -0,0 +1,69 @@
1
+ """Dataset-scoped owner identity storage for self-name resolution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Optional
7
+
8
+ logger = logging.getLogger("topos.storage.user_identity")
9
+
10
+ TABLE = "user_identity"
11
+
12
+
13
def ensure_table(conn) -> None:
    """Create ``user_identity`` table if it does not exist, then commit."""
    ddl = f"""
        CREATE TABLE IF NOT EXISTS {TABLE} (
            dataset_id TEXT NOT NULL PRIMARY KEY,
            display_name TEXT,
            updated_at TEXT DEFAULT (datetime('now'))
        )
        """
    conn.execute(ddl)
    conn.commit()
25
+
26
+
27
def get_user_identity(conn, dataset_id: str) -> Optional[dict]:
    """Look up the stored owner identity for ``dataset_id``.

    Returns ``{"display_name": ...}``, or ``None`` when an argument is falsy,
    no row exists, or the lookup fails (failures are logged as warnings).
    """
    if not (conn and dataset_id):
        return None
    query = f"SELECT display_name FROM {TABLE} WHERE dataset_id = ?"
    try:
        ensure_table(conn)
        record = conn.execute(query, (dataset_id,)).fetchone()
        if record:
            return {"display_name": record[0]}
        return None
    except Exception as e:
        logger.warning("get_user_identity failed: %s", e)
        return None
43
+
44
+
45
def put_user_identity(
    conn,
    dataset_id: str,
    *,
    display_name: Optional[str] = None,
) -> None:
    """Set dataset-scoped owner identity. ``None`` leaves the field unchanged.

    The write is a single upsert: a new row is inserted, or on an existing
    row ``display_name`` is replaced only when a non-None value is supplied.
    Doing the merge inside SQL avoids a separate read whose failure would
    otherwise cause the stored name to be overwritten with NULL.

    Does nothing when ``conn`` or ``dataset_id`` is falsy. Database errors
    are logged as warnings and not raised.
    """
    if not conn or not dataset_id:
        return
    try:
        ensure_table(conn)
        conn.execute(
            f"""
            INSERT INTO {TABLE} (dataset_id, display_name, updated_at)
            VALUES (?, ?, datetime('now'))
            ON CONFLICT(dataset_id) DO UPDATE SET
                display_name = COALESCE(excluded.display_name, display_name),
                updated_at = excluded.updated_at
            """,
            (dataset_id, display_name),
        )
        conn.commit()
    except Exception as e:
        logger.warning("put_user_identity failed: %s", e)
topos/sync/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Sync layer for Topos."""
2
+
3
+ from .client import SyncClient
4
+
5
+ __all__ = ["SyncClient"]