topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,303 @@
1
+ """Flat tables for browser plugin data: one row per event, one column per field.
2
+
3
+ Stored in SQLite with explicit columns so DuckDB (or any SQL engine) can query
4
+ without parsing JSON. Raw format = one row per event; good rows = flat columns
5
+ for analytics (e.g. SELECT url, visited_at, hostname FROM browser_visits).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import logging
12
+ from typing import Any, Dict, Optional
13
+
14
+ logger = logging.getLogger("topos.storage.raw.browser_flat_tables")
15
+
16
+ BROWSER_VISITS_TABLE = "browser_visits"
17
+ BROWSER_EVENTS_TABLE = "browser_events"
18
+ BROWSER_URL_CLASSIFICATION_TABLE = "browser_url_classification"
19
+ # Normalized raw retention table for browser_visits (architecture layer).
20
+ RAW_BROWSER_VISITS_TABLE = "raw_chat_messages_browservisits"
21
+
22
+
23
+ def _to_sql_value(val: Any) -> Optional[str]:
24
+ """Convert Python value to SQL-friendly value (string or None)."""
25
+ if val is None:
26
+ return None
27
+ if isinstance(val, bool):
28
+ return "1" if val else "0"
29
+ if isinstance(val, (dict, list)):
30
+ return json.dumps(val, ensure_ascii=False)
31
+ return str(val)
32
+
33
+
34
def ensure_browser_visits_table(conn) -> None:
    """Make sure the flat ``browser_visits`` table and its indexes exist.

    Runs ``CREATE TABLE IF NOT EXISTS`` followed by one
    ``CREATE INDEX IF NOT EXISTS`` per indexed column (``visited_at``,
    ``hostname``, ``transition_type``), then commits on *conn*.

    Args:
        conn: SQLite connection (anything exposing ``execute`` and ``commit``).
    """
    table = BROWSER_VISITS_TABLE
    conn.execute(f"""
        CREATE TABLE IF NOT EXISTS {table} (
            record_id TEXT PRIMARY KEY,
            dataset_id TEXT,
            url TEXT NOT NULL,
            visited_at TEXT NOT NULL,
            title TEXT,
            favicon_url TEXT,
            hostname TEXT,
            device_name TEXT,
            tab_id INTEGER,
            window_id INTEGER,
            incognito INTEGER,
            transition_type TEXT,
            pinned INTEGER,
            audible INTEGER,
            muted INTEGER,
            opener_tab_id INTEGER,
            referred_by TEXT,
            created_at TEXT DEFAULT (datetime('now'))
        )
    """)
    # Index name and indexed column share the same suffix for this table.
    for column in ("visited_at", "hostname", "transition_type"):
        conn.execute(f"""
            CREATE INDEX IF NOT EXISTS idx_{table}_{column}
            ON {table}({column})
        """)
    conn.commit()
    logger.debug("Ensured table %s exists", table)
72
+
73
+
74
def ensure_browser_events_table(conn) -> None:
    """Make sure the flat ``browser_events`` table and its indexes exist.

    Runs ``CREATE TABLE IF NOT EXISTS`` followed by one
    ``CREATE INDEX IF NOT EXISTS`` for ``event_type`` and one for
    ``visited_at``, then commits on *conn*.

    Args:
        conn: SQLite connection (anything exposing ``execute`` and ``commit``).
    """
    table = BROWSER_EVENTS_TABLE
    conn.execute(f"""
        CREATE TABLE IF NOT EXISTS {table} (
            record_id TEXT PRIMARY KEY,
            dataset_id TEXT,
            event_type TEXT NOT NULL,
            url TEXT,
            visited_at TEXT,
            title TEXT,
            favicon_url TEXT,
            hostname TEXT,
            device_name TEXT,
            transition_type TEXT,
            content TEXT,
            tab_id INTEGER,
            window_id INTEGER,
            incognito INTEGER,
            pinned INTEGER,
            audible INTEGER,
            muted INTEGER,
            opener_tab_id INTEGER,
            starred_at TEXT,
            created_at TEXT DEFAULT (datetime('now'))
        )
    """)
    # Index name and indexed column share the same suffix for this table.
    for column in ("event_type", "visited_at"):
        conn.execute(f"""
            CREATE INDEX IF NOT EXISTS idx_{table}_{column}
            ON {table}({column})
        """)
    conn.commit()
    logger.debug("Ensured table %s exists", table)
110
+
111
+
112
def ensure_browser_url_classification_table(conn) -> None:
    """Make sure the URL classification (enrichment output) table exists.

    Rows are keyed by ``(enriched_from_table, record_id)`` (the Stage 9
    ``enriched_from_table`` column). Two secondary indexes are created, on
    ``url_category`` and on ``dataset_id``; the function then commits on
    *conn*.

    Args:
        conn: SQLite connection (anything exposing ``execute`` and ``commit``).
    """
    table = BROWSER_URL_CLASSIFICATION_TABLE
    conn.execute(f"""
        CREATE TABLE IF NOT EXISTS {table} (
            enriched_from_table TEXT NOT NULL,
            record_id TEXT NOT NULL,
            dataset_id TEXT,
            url TEXT NOT NULL,
            title TEXT,
            url_category TEXT,
            url_confidence REAL,
            model_name TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (enriched_from_table, record_id)
        )
    """)
    # (index-name suffix, indexed column) -- the two differ for this table.
    index_specs = (
        ("category", "url_category"),
        ("dataset", "dataset_id"),
    )
    for suffix, column in index_specs:
        conn.execute(f"""
            CREATE INDEX IF NOT EXISTS idx_{table}_{suffix}
            ON {table}({column})
        """)
    conn.commit()
    logger.debug("Ensured table %s exists", table)
139
+
140
+
141
def backfill_browser_visits_from_raw_retention(conn) -> int:
    """Copy visits from raw retention into browser_visits when the flat table is empty.

    Fresh Topos installs ingest into raw_chat_messages_browservisits first; this
    one-time catch-up makes browser_visits visible in Data Explorer without manual SQL.

    The copy is skipped (returning 0) when the flat table already has rows,
    when the raw retention table does not exist, or when it is empty. Raw
    rows whose ``url`` is NULL or blank are not copied.

    Args:
        conn: SQLite connection (anything exposing ``execute`` and ``commit``).

    Returns:
        Number of rows present in browser_visits after the copy, or 0 when
        nothing was copied.
    """
    ensure_browser_visits_table(conn)
    flat_count = conn.execute(f"SELECT COUNT(*) FROM {BROWSER_VISITS_TABLE}").fetchone()[0]
    if flat_count:
        return 0
    raw_exists = conn.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?",
        (RAW_BROWSER_VISITS_TABLE,),
    ).fetchone()
    if not raw_exists:
        return 0
    raw_count = conn.execute(f"SELECT COUNT(*) FROM {RAW_BROWSER_VISITS_TABLE}").fetchone()[0]
    if not raw_count:
        return 0
    # visited_at is NOT NULL in the flat table and has no default, so a single
    # raw row with a NULL timestamp would abort the whole INSERT OR REPLACE.
    # Store '' instead, which is what write_browser_visit does for a missing
    # visited_at.
    conn.execute(f"""
        INSERT OR REPLACE INTO {BROWSER_VISITS_TABLE}
        (record_id, dataset_id, url, visited_at, title, favicon_url, hostname, device_name,
        tab_id, window_id, incognito, transition_type, pinned, audible, muted, opener_tab_id, referred_by)
        SELECT
            COALESCE(NULLIF(TRIM(record_id), ''), source_record_id),
            dataset_id,
            url,
            COALESCE(visited_at, ''),
            title,
            favicon_url,
            hostname,
            device_name,
            tab_id,
            window_id,
            incognito,
            transition_type,
            pinned,
            audible,
            muted,
            opener_tab_id,
            referred_by
        FROM {RAW_BROWSER_VISITS_TABLE}
        WHERE url IS NOT NULL AND TRIM(url) != ''
    """)
    conn.commit()
    # The flat table was empty before the insert, so its row count is the
    # number of rows that were copied.
    copied = conn.execute(f"SELECT COUNT(*) FROM {BROWSER_VISITS_TABLE}").fetchone()[0]
    logger.info(
        "[PIPELINE:RAW] Backfilled %d browser visit row(s) from %s into %s",
        copied,
        RAW_BROWSER_VISITS_TABLE,
        BROWSER_VISITS_TABLE,
    )
    return int(copied)
194
+
195
+
196
def write_browser_visit(conn, payload: Dict[str, Any]) -> None:
    """Insert or replace one row in browser_visits (flat columns).

    Ensures the table exists, writes the row and commits. Missing
    ``record_id``, ``url`` and ``visited_at`` are stored as empty strings.
    Integer columns accept only real ints (booleans and other types become
    NULL); flag columns accept only real booleans (anything else becomes
    NULL).

    Args:
        conn: SQLite connection (anything exposing ``execute`` and ``commit``).
        payload: Event fields keyed by column name; absent keys become NULL.
    """

    def int_or_none(key: str) -> Optional[int]:
        # bool is a subclass of int, so it has to be excluded explicitly to
        # keep a stray True/False out of the id columns.
        value = payload.get(key)
        if isinstance(value, int) and not isinstance(value, bool):
            return value
        return None

    def flag_or_none(key: str) -> Optional[int]:
        # Only genuine booleans map to 1/0; truthy non-bool values are NULL.
        value = payload.get(key)
        if value is True:
            return 1
        if value is False:
            return 0
        return None

    ensure_browser_visits_table(conn)
    # Normalize to text so a non-string id cannot break the log slice below.
    record_id = _to_sql_value(payload.get("record_id")) or ""
    url = _to_sql_value(payload.get("url")) or ""
    visited_at = _to_sql_value(payload.get("visited_at")) or ""
    conn.execute(f"""
        INSERT OR REPLACE INTO {BROWSER_VISITS_TABLE}
        (record_id, dataset_id, url, visited_at, title, favicon_url, hostname, device_name,
        tab_id, window_id, incognito, transition_type, pinned, audible, muted, opener_tab_id, referred_by)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        record_id,
        _to_sql_value(payload.get("dataset_id")),
        url,
        visited_at,
        _to_sql_value(payload.get("title")),
        _to_sql_value(payload.get("favicon_url")),
        _to_sql_value(payload.get("hostname")),
        _to_sql_value(payload.get("device_name")),
        int_or_none("tab_id"),
        int_or_none("window_id"),
        flag_or_none("incognito"),
        _to_sql_value(payload.get("transition_type")),
        flag_or_none("pinned"),
        flag_or_none("audible"),
        flag_or_none("muted"),
        int_or_none("opener_tab_id"),
        _to_sql_value(payload.get("referred_by")),
    ))
    conn.commit()
    logger.debug("[PIPELINE:RAW] Wrote flat row to %s: record_id=%s", BROWSER_VISITS_TABLE, record_id[:24] if record_id else None)
228
+
229
+
230
def write_browser_event(conn, payload: Dict[str, Any]) -> None:
    """Insert or replace one row in browser_events (flat columns).

    Ensures the table exists, writes the row and commits. A missing
    ``record_id`` is stored as an empty string and a missing ``event_type``
    as ``"unknown"``. Integer columns accept only real ints (booleans and
    other types become NULL); flag columns accept only real booleans
    (anything else becomes NULL).

    Args:
        conn: SQLite connection (anything exposing ``execute`` and ``commit``).
        payload: Event fields keyed by column name; absent keys become NULL.
    """

    def int_or_none(key: str) -> Optional[int]:
        # bool is a subclass of int, so it has to be excluded explicitly to
        # keep a stray True/False out of the id columns.
        value = payload.get(key)
        if isinstance(value, int) and not isinstance(value, bool):
            return value
        return None

    def flag_or_none(key: str) -> Optional[int]:
        # Only genuine booleans map to 1/0; truthy non-bool values are NULL.
        value = payload.get(key)
        if value is True:
            return 1
        if value is False:
            return 0
        return None

    ensure_browser_events_table(conn)
    # Normalize to text so a non-string id cannot break the log slice below.
    record_id = _to_sql_value(payload.get("record_id")) or ""
    event_type = _to_sql_value(payload.get("event_type")) or "unknown"
    conn.execute(f"""
        INSERT OR REPLACE INTO {BROWSER_EVENTS_TABLE}
        (record_id, dataset_id, event_type, url, visited_at, title, favicon_url, hostname, device_name,
        transition_type, content, tab_id, window_id, incognito, pinned, audible, muted, opener_tab_id, starred_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        record_id,
        _to_sql_value(payload.get("dataset_id")),
        event_type,
        _to_sql_value(payload.get("url")),
        _to_sql_value(payload.get("visited_at")),
        _to_sql_value(payload.get("title")),
        _to_sql_value(payload.get("favicon_url")),
        _to_sql_value(payload.get("hostname")),
        _to_sql_value(payload.get("device_name")),
        _to_sql_value(payload.get("transition_type")),
        _to_sql_value(payload.get("content")),
        int_or_none("tab_id"),
        int_or_none("window_id"),
        flag_or_none("incognito"),
        flag_or_none("pinned"),
        flag_or_none("audible"),
        flag_or_none("muted"),
        int_or_none("opener_tab_id"),
        _to_sql_value(payload.get("starred_at")),
    ))
    conn.commit()
    logger.debug("[PIPELINE:RAW] Wrote flat row to %s: record_id=%s event_type=%s", BROWSER_EVENTS_TABLE, record_id[:24] if record_id else None, event_type)
263
+
264
+
265
def write_browser_url_classification(
    conn,
    *,
    source_table: str,
    record_id: str,
    dataset_id: Optional[str],
    url: str,
    title: Optional[str],
    category: Optional[str],
    confidence: Optional[float],
    model_name: Optional[str],
    ensure_table: bool = True,
    log_write: bool = True,
) -> None:
    """Upsert a single URL classification (enrichment) row and commit.

    The row is keyed by ``(source_table, record_id)``; an existing row with
    the same key is replaced and ``updated_at`` is set to the current time.

    Args:
        conn: SQLite connection (anything exposing ``execute`` and ``commit``).
        source_table: Stored in the ``enriched_from_table`` column.
        record_id: Id of the classified record.
        dataset_id: Dataset of the record, if known.
        url: The URL that was classified.
        title: Page title, if any.
        category: Stored in ``url_category``.
        confidence: Stored in ``url_confidence`` as given (not stringified).
        model_name: Name of the model that produced the classification.
        ensure_table: When true, create the table and indexes first.
        log_write: When true, emit a debug log line after the commit.
    """
    if ensure_table:
        ensure_browser_url_classification_table(conn)

    row = (
        _to_sql_value(source_table),
        _to_sql_value(record_id),
        _to_sql_value(dataset_id),
        _to_sql_value(url),
        _to_sql_value(title),
        _to_sql_value(category),
        confidence,
        _to_sql_value(model_name),
    )
    conn.execute(f"""
        INSERT OR REPLACE INTO {BROWSER_URL_CLASSIFICATION_TABLE}
        (enriched_from_table, record_id, dataset_id, url, title, url_category, url_confidence, model_name, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, datetime('now'))
    """, row)
    conn.commit()

    if not log_write:
        return
    # Only a prefix of the id is logged.
    short_id = record_id[:24] if record_id else None
    logger.debug(
        "[PIPELINE:RAW] Wrote URL classification row: source=%s record_id=%s category=%s",
        source_table,
        short_id,
        category,
    )
@@ -0,0 +1,100 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import shutil
5
+ import json
6
+ import os
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ from .raw_store import RawFile, RawFileRef
12
+
13
+ logger = logging.getLogger("topos.storage.raw.file_store")
14
+
15
+
16
@dataclass(frozen=True)
class RawFileStore:
    """Filesystem store for raw ingestion files.

    Layout: ``<base_path>/<sanitized dataset_id>/<sanitized schema_id>.jsonl``.
    Before an existing file is overwritten by ``write_file`` or
    ``write_bytes`` it is copied to ``<name>.jsonl.backup`` next to it.
    """

    base_path: Path

    def __init__(self, base_path: Optional[Path] = None):
        """Resolve and create the base directory.

        Precedence: explicit *base_path*, then the
        ``TOPOS_INGESTION_BASE_PATH`` environment variable, then
        ``~/.topos/ingestion``.
        """
        env_override = os.getenv("TOPOS_INGESTION_BASE_PATH")
        resolved_base = base_path or (Path(env_override) if env_override else Path.home() / ".topos" / "ingestion")
        # The dataclass is frozen, so the field has to be set via object.__setattr__.
        object.__setattr__(self, "base_path", resolved_base)
        self.base_path.mkdir(parents=True, exist_ok=True)

    def get_file_path(self, dataset_id: str, schema_id: str) -> Path:
        """Return the ``.jsonl`` path for a dataset/schema pair, creating its directory.

        ``:`` and ``/`` in *dataset_id* and ``.`` and ``/`` in *schema_id* are
        replaced with ``_``.
        """
        # NOTE(review): a dataset_id of ".." (or one containing a backslash on
        # Windows) is not neutralized here and would resolve outside
        # base_path -- confirm callers never pass untrusted ids.
        safe_dataset_id = dataset_id.replace(":", "_").replace("/", "_")
        safe_schema_id = schema_id.replace(".", "_").replace("/", "_")
        dataset_dir = self.base_path / safe_dataset_id
        dataset_dir.mkdir(parents=True, exist_ok=True)
        return dataset_dir / f"{safe_schema_id}.jsonl"

    @staticmethod
    def _backup_existing(destination: Path) -> None:
        """Copy *destination* to ``<name>.jsonl.backup`` if it already exists."""
        if destination.exists():
            shutil.copy2(destination, destination.with_suffix(".jsonl.backup"))

    def write_file(self, raw_file: RawFile) -> RawFileRef:
        """Copy ``raw_file.file_path`` into the store and return a reference to it.

        The destination is derived from the ``dataset_id`` and ``schema_id``
        metadata entries (each defaulting to ``"unknown"``). When the source
        already is the destination, nothing is copied.
        """
        destination = self.get_file_path(
            raw_file.metadata.get("dataset_id", "unknown"),
            raw_file.metadata.get("schema_id", "unknown"),
        )
        source_path = Path(raw_file.file_path)
        if source_path.resolve() == destination.resolve():
            return RawFileRef(file_id=destination.stem, file_path=str(destination))
        self._backup_existing(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(raw_file.file_path, destination)
        logger.info("Saved raw file: %s", destination)
        return RawFileRef(file_id=destination.stem, file_path=str(destination))

    def write_bytes(self, dataset_id: str, schema_id: str, payload: bytes) -> RawFileRef:
        """Write *payload* as the file for the dataset/schema pair, replacing any previous content."""
        destination = self.get_file_path(dataset_id, schema_id)
        self._backup_existing(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(payload)
        logger.info("Saved raw file bytes: %s", destination)
        return RawFileRef(file_id=destination.stem, file_path=str(destination))

    def append_record(self, dataset_id: str, schema_id: str, record: dict) -> RawFileRef:
        """Append *record* as one JSON line to the file for the dataset/schema pair."""
        destination = self.get_file_path(dataset_id, schema_id)
        destination.parent.mkdir(parents=True, exist_ok=True)
        with destination.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(record))
            handle.write("\n")
        return RawFileRef(file_id=destination.stem, file_path=str(destination))

    def list_datasets(self) -> list[dict]:
        """List all datasets with their file stats.

        Returns:
            One dict per dataset directory that holds data, with keys
            ``dataset_id``, ``total_size`` (bytes), ``message_count`` (lines
            summed over all of its ``.jsonl`` files) and ``schemas`` (a list
            of ``{"schema_id", "file_size"}`` dicts).
        """
        datasets: list[dict] = []
        if not self.base_path.exists():
            return datasets
        for dataset_dir in self.base_path.iterdir():
            if not dataset_dir.is_dir():
                continue
            # NOTE(review): lossy inverse of get_file_path -- every "_" turns
            # into ":" (and into "." for schema ids below), so ids that
            # originally contained "_" or "/" are not recovered exactly.
            dataset_id = dataset_dir.name.replace("_", ":")
            total_size = 0
            message_count = 0
            schemas = []
            # "*.jsonl" cannot match the "<name>.jsonl.backup" copies, so no
            # extra filtering of backups is needed.
            for file_path in dataset_dir.glob("*.jsonl"):
                file_size = file_path.stat().st_size
                total_size += file_size
                schema_id = file_path.stem.replace("_", ".")
                # Count messages in file (one per line); best effort, so an
                # unreadable file is reported instead of failing the listing.
                try:
                    with file_path.open("r", encoding="utf-8") as handle:
                        for _ in handle:
                            message_count += 1
                except (OSError, ValueError) as exc:
                    # ValueError also covers UnicodeDecodeError.
                    logger.warning("Could not count records in %s: %s", file_path, exc)
                schemas.append({"schema_id": schema_id, "file_size": file_size})
            if total_size > 0 or message_count > 0:
                datasets.append({
                    "dataset_id": dataset_id,
                    "total_size": total_size,
                    "message_count": message_count,
                    "schemas": schemas,
                })
        return datasets
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Dict
5
+
6
+
7
@dataclass(frozen=True)
class RawFile:
    """Immutable description of a raw file together with its string metadata."""

    # Path of the file, kept as a plain string.
    file_path: str
    # Free-form string metadata attached to the file.
    metadata: Dict[str, str]
11
+
12
+
13
@dataclass(frozen=True)
class RawFileRef:
    """Immutable reference to a stored raw file."""

    # Identifier of the stored file.
    file_id: str
    # Path of the stored file, kept as a plain string.
    file_path: str
17
+
18
+
19
@dataclass(frozen=True)
class RawRecordRef:
    """Immutable reference to a single stored raw record."""

    # Identifier of the stored record.
    record_id: str
22
+
23
+
24
class RawStore:
    """Base interface for raw stores.

    Both methods raise ``NotImplementedError`` here; concrete stores are
    expected to override them.
    """

    def write_file(self, file: RawFile) -> RawFileRef:
        """Store *file* and return a reference to it (not implemented here)."""
        raise NotImplementedError

    def write_record(self, record: Dict[str, str]) -> RawRecordRef:
        """Store *record* and return a reference to it (not implemented here)."""
        raise NotImplementedError