topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
"""Flat tables for browser plugin data: one row per event, one column per field.
|
|
2
|
+
|
|
3
|
+
Stored in SQLite with explicit columns so DuckDB (or any SQL engine) can query
|
|
4
|
+
without parsing JSON. Raw format = one row per event; good rows = flat columns
|
|
5
|
+
for analytics (e.g. SELECT url, visited_at, hostname FROM browser_visits).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
from typing import Any, Dict, Optional
|
|
13
|
+
|
|
14
|
+
# Module logger; the name is spelled out explicitly rather than taken from __name__.
logger = logging.getLogger("topos.storage.raw.browser_flat_tables")

# Flat (one column per field) tables created and written by this module.
BROWSER_VISITS_TABLE = "browser_visits"
BROWSER_EVENTS_TABLE = "browser_events"
BROWSER_URL_CLASSIFICATION_TABLE = "browser_url_classification"

# Normalized raw retention table for browser_visits (architecture layer).
# Only read here, as the source of the one-time backfill into browser_visits.
RAW_BROWSER_VISITS_TABLE = "raw_chat_messages_browservisits"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _to_sql_value(val: Any) -> Optional[str]:
|
|
24
|
+
"""Convert Python value to SQL-friendly value (string or None)."""
|
|
25
|
+
if val is None:
|
|
26
|
+
return None
|
|
27
|
+
if isinstance(val, bool):
|
|
28
|
+
return "1" if val else "0"
|
|
29
|
+
if isinstance(val, (dict, list)):
|
|
30
|
+
return json.dumps(val, ensure_ascii=False)
|
|
31
|
+
return str(val)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def ensure_browser_visits_table(conn) -> None:
    """Create the flat ``browser_visits`` table and its indexes if missing.

    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly is
    harmless. The connection is committed before returning.

    Args:
        conn: Open database connection exposing ``execute`` and ``commit``.
    """
    table = BROWSER_VISITS_TABLE
    conn.execute(f"""
        CREATE TABLE IF NOT EXISTS {table} (
            record_id TEXT PRIMARY KEY,
            dataset_id TEXT,
            url TEXT NOT NULL,
            visited_at TEXT NOT NULL,
            title TEXT,
            favicon_url TEXT,
            hostname TEXT,
            device_name TEXT,
            tab_id INTEGER,
            window_id INTEGER,
            incognito INTEGER,
            transition_type TEXT,
            pinned INTEGER,
            audible INTEGER,
            muted INTEGER,
            opener_tab_id INTEGER,
            referred_by TEXT,
            created_at TEXT DEFAULT (datetime('now'))
        )
    """)
    # One single-column index per commonly filtered field.
    for column in ("visited_at", "hostname", "transition_type"):
        conn.execute(f"""
            CREATE INDEX IF NOT EXISTS idx_{table}_{column}
            ON {table}({column})
        """)
    conn.commit()
    logger.debug("Ensured table %s exists", table)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def ensure_browser_events_table(conn) -> None:
    """Create the flat ``browser_events`` table and its indexes if missing.

    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly is
    harmless. The connection is committed before returning.

    Args:
        conn: Open database connection exposing ``execute`` and ``commit``.
    """
    table = BROWSER_EVENTS_TABLE
    conn.execute(f"""
        CREATE TABLE IF NOT EXISTS {table} (
            record_id TEXT PRIMARY KEY,
            dataset_id TEXT,
            event_type TEXT NOT NULL,
            url TEXT,
            visited_at TEXT,
            title TEXT,
            favicon_url TEXT,
            hostname TEXT,
            device_name TEXT,
            transition_type TEXT,
            content TEXT,
            tab_id INTEGER,
            window_id INTEGER,
            incognito INTEGER,
            pinned INTEGER,
            audible INTEGER,
            muted INTEGER,
            opener_tab_id INTEGER,
            starred_at TEXT,
            created_at TEXT DEFAULT (datetime('now'))
        )
    """)
    # One single-column index per commonly filtered field.
    for column in ("event_type", "visited_at"):
        conn.execute(f"""
            CREATE INDEX IF NOT EXISTS idx_{table}_{column}
            ON {table}({column})
        """)
    conn.commit()
    logger.debug("Ensured table %s exists", table)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def ensure_browser_url_classification_table(conn) -> None:
    """Create the URL classification (enrichment output) table if missing.

    Rows are keyed by ``(enriched_from_table, record_id)``; the
    ``enriched_from_table`` column name follows the Stage 9 naming.
    Every statement uses ``IF NOT EXISTS`` and the connection is committed
    before returning.

    Args:
        conn: Open database connection exposing ``execute`` and ``commit``.
    """
    table = BROWSER_URL_CLASSIFICATION_TABLE
    conn.execute(f"""
        CREATE TABLE IF NOT EXISTS {table} (
            enriched_from_table TEXT NOT NULL,
            record_id TEXT NOT NULL,
            dataset_id TEXT,
            url TEXT NOT NULL,
            title TEXT,
            url_category TEXT,
            url_confidence REAL,
            model_name TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (enriched_from_table, record_id)
        )
    """)
    # (index name suffix, indexed column) - the suffix is shorter than the column name.
    for suffix, column in (("category", "url_category"), ("dataset", "dataset_id")):
        conn.execute(f"""
            CREATE INDEX IF NOT EXISTS idx_{table}_{suffix}
            ON {table}({column})
        """)
    conn.commit()
    logger.debug("Ensured table %s exists", table)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def backfill_browser_visits_from_raw_retention(conn) -> int:
    """Copy visits from raw retention into browser_visits when the flat table is empty.

    Fresh Topos installs ingest into raw_chat_messages_browservisits first; this
    one-time catch-up makes browser_visits visible in Data Explorer without manual SQL.

    Nothing is copied when the flat table already holds rows, when the raw
    retention table does not exist, or when it is empty. Raw rows whose ``url``
    is NULL or blank are skipped. A NULL ``visited_at`` is copied as ``''``:
    the target column is NOT NULL without a default, so a single such row
    would otherwise abort the whole statement and the backfill could never
    complete. ``''`` is also what ``write_browser_visit`` stores for a
    missing timestamp.

    Args:
        conn: Open SQLite connection exposing ``execute`` and ``commit``.

    Returns:
        Number of rows in ``browser_visits`` after the copy, or 0 when no
        copy was performed.
    """
    ensure_browser_visits_table(conn)

    def _row_count(table: str) -> int:
        # Table names are module constants, never user input.
        return conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]

    if _row_count(BROWSER_VISITS_TABLE):
        return 0

    raw_exists = conn.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?",
        (RAW_BROWSER_VISITS_TABLE,),
    ).fetchone()
    if not raw_exists:
        return 0
    if not _row_count(RAW_BROWSER_VISITS_TABLE):
        return 0

    # NOTE(review): assumes the raw retention table exposes every column
    # selected below, including source_record_id - confirm against its schema.
    conn.execute(f"""
        INSERT OR REPLACE INTO {BROWSER_VISITS_TABLE}
            (record_id, dataset_id, url, visited_at, title, favicon_url, hostname, device_name,
             tab_id, window_id, incognito, transition_type, pinned, audible, muted, opener_tab_id, referred_by)
        SELECT
            COALESCE(NULLIF(TRIM(record_id), ''), source_record_id),
            dataset_id,
            url,
            COALESCE(visited_at, ''),
            title,
            favicon_url,
            hostname,
            device_name,
            tab_id,
            window_id,
            incognito,
            transition_type,
            pinned,
            audible,
            muted,
            opener_tab_id,
            referred_by
        FROM {RAW_BROWSER_VISITS_TABLE}
        WHERE url IS NOT NULL AND TRIM(url) != ''
    """)
    conn.commit()

    # The flat table was empty before the insert, so its size equals the
    # number of rows that were copied.
    copied = _row_count(BROWSER_VISITS_TABLE)
    logger.info(
        "[PIPELINE:RAW] Backfilled %d browser visit row(s) from %s into %s",
        copied,
        RAW_BROWSER_VISITS_TABLE,
        BROWSER_VISITS_TABLE,
    )
    return int(copied)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _flag_to_int(value: Any) -> Optional[int]:
    """Map a strict boolean to 1/0; any other value (including None) becomes NULL."""
    if value is True:
        return 1
    if value is False:
        return 0
    return None


def _int_or_none(value: Any) -> Optional[int]:
    """Return *value* when it is an int, otherwise None (bound as SQL NULL).

    ``bool`` is a subclass of ``int`` and is therefore passed through, as in
    the original inline expression.
    """
    return value if isinstance(value, int) else None


def _record_id_text(payload: Dict[str, Any]) -> str:
    """Return the payload's record_id as text, or '' when it is missing/falsy.

    Normalising to ``str`` keeps the value safe to slice for log output even
    when the caller supplied a non-string id (for example an int).
    """
    return _to_sql_value(payload.get("record_id") or None) or ""


def write_browser_visit(conn, payload: Dict[str, Any]) -> None:
    """Insert or replace one row in browser_visits (flat columns).

    The table is created on demand and the connection is committed after the
    write. Missing ``url`` / ``visited_at`` values are stored as ``''``
    because both columns are NOT NULL. Integer columns accept only ``int``
    values and flag columns only strict booleans; anything else is stored
    as NULL.

    Args:
        conn: Open SQLite connection exposing ``execute`` and ``commit``.
        payload: Event fields keyed by column name; unknown keys are ignored.
    """
    ensure_browser_visits_table(conn)
    record_id = _record_id_text(payload)
    url = _to_sql_value(payload.get("url")) or ""
    visited_at = _to_sql_value(payload.get("visited_at")) or ""
    params = (
        record_id,
        _to_sql_value(payload.get("dataset_id")),
        url,
        visited_at,
        _to_sql_value(payload.get("title")),
        _to_sql_value(payload.get("favicon_url")),
        _to_sql_value(payload.get("hostname")),
        _to_sql_value(payload.get("device_name")),
        _int_or_none(payload.get("tab_id")),
        _int_or_none(payload.get("window_id")),
        _flag_to_int(payload.get("incognito")),
        _to_sql_value(payload.get("transition_type")),
        _flag_to_int(payload.get("pinned")),
        _flag_to_int(payload.get("audible")),
        _flag_to_int(payload.get("muted")),
        _int_or_none(payload.get("opener_tab_id")),
        _to_sql_value(payload.get("referred_by")),
    )
    conn.execute(f"""
        INSERT OR REPLACE INTO {BROWSER_VISITS_TABLE}
            (record_id, dataset_id, url, visited_at, title, favicon_url, hostname, device_name,
             tab_id, window_id, incognito, transition_type, pinned, audible, muted, opener_tab_id, referred_by)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, params)
    conn.commit()
    # Only a prefix of the id is logged to keep debug lines short.
    logger.debug("[PIPELINE:RAW] Wrote flat row to %s: record_id=%s", BROWSER_VISITS_TABLE, record_id[:24] if record_id else None)


def write_browser_event(conn, payload: Dict[str, Any]) -> None:
    """Insert or replace one row in browser_events (flat columns).

    The table is created on demand and the connection is committed after the
    write. A missing ``event_type`` is stored as ``"unknown"`` because the
    column is NOT NULL. Integer columns accept only ``int`` values and flag
    columns only strict booleans; anything else is stored as NULL.

    Args:
        conn: Open SQLite connection exposing ``execute`` and ``commit``.
        payload: Event fields keyed by column name; unknown keys are ignored.
    """
    ensure_browser_events_table(conn)
    record_id = _record_id_text(payload)
    event_type = _to_sql_value(payload.get("event_type")) or "unknown"
    params = (
        record_id,
        _to_sql_value(payload.get("dataset_id")),
        event_type,
        _to_sql_value(payload.get("url")),
        _to_sql_value(payload.get("visited_at")),
        _to_sql_value(payload.get("title")),
        _to_sql_value(payload.get("favicon_url")),
        _to_sql_value(payload.get("hostname")),
        _to_sql_value(payload.get("device_name")),
        _to_sql_value(payload.get("transition_type")),
        _to_sql_value(payload.get("content")),
        _int_or_none(payload.get("tab_id")),
        _int_or_none(payload.get("window_id")),
        _flag_to_int(payload.get("incognito")),
        _flag_to_int(payload.get("pinned")),
        _flag_to_int(payload.get("audible")),
        _flag_to_int(payload.get("muted")),
        _int_or_none(payload.get("opener_tab_id")),
        _to_sql_value(payload.get("starred_at")),
    )
    conn.execute(f"""
        INSERT OR REPLACE INTO {BROWSER_EVENTS_TABLE}
            (record_id, dataset_id, event_type, url, visited_at, title, favicon_url, hostname, device_name,
             transition_type, content, tab_id, window_id, incognito, pinned, audible, muted, opener_tab_id, starred_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, params)
    conn.commit()
    # Only a prefix of the id is logged to keep debug lines short.
    logger.debug("[PIPELINE:RAW] Wrote flat row to %s: record_id=%s event_type=%s", BROWSER_EVENTS_TABLE, record_id[:24] if record_id else None, event_type)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def write_browser_url_classification(
    conn,
    *,
    source_table: str,
    record_id: str,
    dataset_id: Optional[str],
    url: str,
    title: Optional[str],
    category: Optional[str],
    confidence: Optional[float],
    model_name: Optional[str],
    ensure_table: bool = True,
    log_write: bool = True,
) -> None:
    """Insert or replace one URL classification enrichment row.

    The row is keyed by ``(source_table, record_id)`` and ``updated_at`` is
    set to the database's current time. The connection is committed after
    the write.

    Args:
        conn: Open SQLite connection exposing ``execute`` and ``commit``.
        source_table: Stored in the ``enriched_from_table`` column.
        record_id: Identifier of the classified record.
        dataset_id: Optional dataset the record belongs to.
        url: URL that was classified.
        title: Optional page title.
        category: Stored in ``url_category``.
        confidence: Stored as-is in ``url_confidence``.
        model_name: Name of the model that produced the classification.
        ensure_table: When true, create the table first if it is missing.
        log_write: When true, emit a debug log line after the write.
    """
    if ensure_table:
        ensure_browser_url_classification_table(conn)

    # Text fields are normalised through _to_sql_value; confidence is bound unchanged.
    row = (
        _to_sql_value(source_table),
        _to_sql_value(record_id),
        _to_sql_value(dataset_id),
        _to_sql_value(url),
        _to_sql_value(title),
        _to_sql_value(category),
        confidence,
        _to_sql_value(model_name),
    )
    conn.execute(f"""
        INSERT OR REPLACE INTO {BROWSER_URL_CLASSIFICATION_TABLE}
            (enriched_from_table, record_id, dataset_id, url, title, url_category, url_confidence, model_name, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, datetime('now'))
    """, row)
    conn.commit()

    if not log_write:
        return
    short_id = record_id[:24] if record_id else None
    logger.debug(
        "[PIPELINE:RAW] Wrote URL classification row: source=%s record_id=%s category=%s",
        source_table,
        short_id,
        category,
    )
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import shutil
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from .raw_store import RawFile, RawFileRef
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger("topos.storage.raw.file_store")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class RawFileStore:
    """Filesystem store keeping one JSONL file per (dataset, schema) pair.

    Files live at ``<base_path>/<sanitized dataset_id>/<sanitized schema_id>.jsonl``.
    Before a whole-file overwrite the previous content is copied to
    ``<sanitized schema_id>.jsonl.backup`` in the same directory.
    """

    # Root directory of the store; created on construction if missing.
    base_path: Path

    def __init__(self, base_path: Optional[Path] = None):
        """Resolve the storage root and make sure it exists.

        Resolution order: explicit *base_path*, then the
        ``TOPOS_INGESTION_BASE_PATH`` environment variable, then
        ``~/.topos/ingestion``.
        """
        env_override = os.getenv("TOPOS_INGESTION_BASE_PATH")
        if base_path:
            # Path() also accepts a plain string, so str arguments work too.
            resolved_base = Path(base_path)
        elif env_override:
            resolved_base = Path(env_override)
        else:
            resolved_base = Path.home() / ".topos" / "ingestion"
        # The dataclass is frozen, so regular attribute assignment is blocked.
        object.__setattr__(self, "base_path", resolved_base)
        self.base_path.mkdir(parents=True, exist_ok=True)

    def get_file_path(self, dataset_id: str, schema_id: str) -> Path:
        """Return the JSONL path for a dataset/schema pair, creating its directory.

        ``:`` and ``/`` in *dataset_id* and ``.`` and ``/`` in *schema_id* are
        replaced with ``_``. The mapping is lossy: ids that already contain
        ``_`` cannot be told apart from sanitized ones afterwards.
        """
        safe_dataset_id = dataset_id.replace(":", "_").replace("/", "_")
        safe_schema_id = schema_id.replace(".", "_").replace("/", "_")
        dataset_dir = self.base_path / safe_dataset_id
        dataset_dir.mkdir(parents=True, exist_ok=True)
        return dataset_dir / f"{safe_schema_id}.jsonl"

    @staticmethod
    def _backup_existing(destination: Path) -> None:
        """Copy *destination* to its ``.jsonl.backup`` sibling if it exists."""
        if destination.exists():
            shutil.copy2(destination, destination.with_suffix(".jsonl.backup"))

    def write_file(self, raw_file: RawFile) -> RawFileRef:
        """Copy a raw file into the store, backing up any file it replaces.

        The dataset and schema ids are read from ``raw_file.metadata`` and
        default to ``"unknown"``. When the source already is the destination
        file, nothing is copied.
        """
        destination = self.get_file_path(
            raw_file.metadata.get("dataset_id", "unknown"),
            raw_file.metadata.get("schema_id", "unknown"),
        )
        source_path = Path(raw_file.file_path)
        if source_path.resolve() == destination.resolve():
            return RawFileRef(file_id=destination.stem, file_path=str(destination))
        # get_file_path() has already created the parent directory.
        self._backup_existing(destination)
        shutil.copy2(raw_file.file_path, destination)
        logger.info("Saved raw file: %s", destination)
        return RawFileRef(file_id=destination.stem, file_path=str(destination))

    def write_bytes(self, dataset_id: str, schema_id: str, payload: bytes) -> RawFileRef:
        """Write *payload* as the file's full content, backing up any previous file."""
        destination = self.get_file_path(dataset_id, schema_id)
        self._backup_existing(destination)
        destination.write_bytes(payload)
        logger.info("Saved raw file bytes: %s", destination)
        return RawFileRef(file_id=destination.stem, file_path=str(destination))

    def append_record(self, dataset_id: str, schema_id: str, record: dict) -> RawFileRef:
        """Append *record* as one JSON line to the dataset/schema file."""
        destination = self.get_file_path(dataset_id, schema_id)
        with destination.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(record))
            handle.write("\n")
        return RawFileRef(file_id=destination.stem, file_path=str(destination))

    def list_datasets(self) -> list[dict]:
        """List all datasets with their file stats.

        Returns:
            One dict per dataset directory that contains data, with keys
            ``dataset_id``, ``total_size`` (bytes), ``message_count`` (lines
            across all files) and ``schemas`` (``schema_id`` / ``file_size``
            per file). Ids are rebuilt by replacing every ``_`` with ``:``
            (datasets) or ``.`` (schemas), so ids that originally contained
            ``_`` or ``/`` do not round-trip.
        """
        datasets = []
        if not self.base_path.exists():
            return datasets
        for dataset_dir in self.base_path.iterdir():
            if not dataset_dir.is_dir():
                continue
            total_size = 0
            message_count = 0
            schemas = []
            # "*.jsonl" cannot match "*.jsonl.backup", so backups are
            # excluded by the glob itself.
            for file_path in dataset_dir.glob("*.jsonl"):
                try:
                    file_size = file_path.stat().st_size
                except OSError as exc:
                    # The file vanished or is inaccessible: skip it instead
                    # of failing the whole listing.
                    logger.warning("Skipping raw file %s: %s", file_path, exc)
                    continue
                total_size += file_size
                # Count messages in file
                try:
                    with file_path.open("r", encoding="utf-8") as handle:
                        for _ in handle:
                            message_count += 1
                except (OSError, UnicodeError) as exc:
                    # Best effort: keep the lines counted so far, but make
                    # the failure visible instead of swallowing it.
                    logger.warning("Could not count messages in %s: %s", file_path, exc)
                schemas.append({
                    "schema_id": file_path.stem.replace("_", "."),
                    "file_size": file_size,
                })
            if total_size > 0 or message_count > 0:
                datasets.append({
                    "dataset_id": dataset_dir.name.replace("_", ":"),
                    "total_size": total_size,
                    "message_count": message_count,
                    "schemas": schemas,
                })
        return datasets
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Dict
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
class RawFile:
    """Immutable description of a raw file: its path plus string metadata."""

    # Location of the file on disk.
    file_path: str
    # Free-form string key/value metadata attached to the file.
    metadata: Dict[str, str]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True)
class RawFileRef:
    """Immutable reference to a stored raw file."""

    # Identifier of the stored file.
    file_id: str
    # Path of the stored file.
    file_path: str
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class RawRecordRef:
    """Immutable reference to a single stored raw record."""

    # Identifier of the stored record.
    record_id: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class RawStore:
    """Base raw store; both write methods raise ``NotImplementedError`` here."""

    def write_file(self, file: RawFile) -> RawFileRef:
        """Store a whole raw file and return a reference to it.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def write_record(self, record: Dict[str, str]) -> RawRecordRef:
        """Store a single raw record and return a reference to it.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
|