topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
"""Raw tables manager for storing original payloads before canonicalization."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import sqlite3
|
|
8
|
+
from typing import Any, Dict, Optional
|
|
9
|
+
|
|
10
|
+
# Module-level logger; the channel name is spelled out rather than derived from __name__.
logger = logging.getLogger("topos.storage.raw.raw_tables_manager")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RawTablesManager:
    """Manages raw retention tables for storing original payloads.

    According to architecture, raw tables are per-connector:
    - `raw_chat_messages_{source}` for chat sources
    - `raw_{source}_events` for event sources

    Table and index names are derived from caller-supplied source ids, so every
    identifier interpolated into SQL goes through ``_quote_ident``. Quoting does
    not change the name of any table that could already be created unquoted.
    """

    # Table that stores browser visits as normalized columns (no payload_json).
    _BROWSER_VISITS_TABLE = "raw_chat_messages_browservisits"

    # Source ids whose underscores are simply dropped to form the table suffix.
    _FLAT_BROWSER_SOURCES = ("browser_visits", "browser_events", "starred_websites")

    # Source names recognised as a whole "_"-separated component of a source id.
    _KNOWN_CHAT_SOURCES = ("chatgpt", "grok", "claude", "gemini")

    # Column list shared by the migration INSERT and the per-record INSERT.
    _BROWSER_VISITS_COLUMNS = (
        "source_system, source_record_id, record_id, dataset_id, url, visited_at, title, "
        "favicon_url, hostname, device_name, tab_id, window_id, incognito, transition_type, "
        "pinned, audible, muted, opener_tab_id, referred_by, created_at"
    )

    def __init__(self, conn: sqlite3.Connection):
        """Initialize with database connection."""
        self.conn = conn

    @staticmethod
    def _quote_ident(name: str) -> str:
        """Return *name* as a double-quoted SQL identifier (embedded quotes doubled)."""
        return '"' + name.replace('"', '""') + '"'

    @staticmethod
    def _json_flag_sql(key: str) -> str:
        """Return a SQL CASE expression mapping a JSON boolean-ish field to 1/0/NULL."""
        extract = f"json_extract(payload_json, '$.{key}')"
        return (
            "CASE "
            f"WHEN {extract} IN (1, '1', 'true', 'TRUE') THEN 1 "
            f"WHEN {extract} IN (0, '0', 'false', 'FALSE') THEN 0 "
            "ELSE NULL END"
        )

    @staticmethod
    def _flag_or_none(value: Any) -> Optional[int]:
        """Map exactly ``True``/``False`` to 1/0; anything else becomes ``None``."""
        if value is True:
            return 1
        if value is False:
            return 0
        return None

    @staticmethod
    def _int_or_none(value: Any) -> Optional[int]:
        """Pass through ``int`` instances unchanged; anything else becomes ``None``."""
        return value if isinstance(value, int) else None

    def _ensure_indexes(self, table_name: str, columns) -> None:
        """Create ``idx_{table_name}_{column}`` on each of *columns* if missing."""
        table = self._quote_ident(table_name)
        for column in columns:
            index = self._quote_ident(f"idx_{table_name}_{column}")
            self.conn.execute(
                f"CREATE INDEX IF NOT EXISTS {index} ON {table}({self._quote_ident(column)})"
            )

    def get_raw_table_name(self, source_id: str, source_type: str = "chat_messages") -> str:
        """Get raw table name for a source.

        Args:
            source_id: Source identifier (e.g., "chatgpt", "chatgpt_ui_conversation")
            source_type: Type of data ("chat_messages", "events", etc.)

        Returns:
            Table name like "raw_chat_messages_chatgpt"
        """
        if source_id in self._FLAT_BROWSER_SOURCES:
            base_source = source_id.replace("_", "")
        else:
            base_source = source_id
            if "_" in source_id:
                # Prefer the first component that is exactly a known source name,
                # e.g. "chatgpt_ui_conversation" -> "chatgpt".
                for part in source_id.split("_"):
                    if part in self._KNOWN_CHAT_SOURCES:
                        base_source = part
                        break
                # NOTE(review): the nesting of this fallback was reconstructed from a
                # flattened listing; confirm it belongs inside the "_" branch.
                if base_source == source_id:
                    lowered = source_id.lower()
                    if "chatgpt" in lowered:
                        base_source = "chatgpt"
                    elif "grok" in lowered:
                        base_source = "grok"
                    else:
                        # Fallback: drop the "dev_test_" prefix and all underscores.
                        base_source = source_id.replace("dev_test_", "").replace("_", "")

        if source_type == "chat_messages":
            return f"raw_chat_messages_{base_source}"
        return f"raw_{base_source}_{source_type}"

    def ensure_raw_table(self, table_name: str) -> None:
        """Ensure raw table exists with proper schema.

        Raw tables store original payloads verbatim with:
        - source_system: Source identifier
        - source_record_id: Unique record ID within source
        - payload_json: Original payload as JSON string
        - created_at: Timestamp when record was stored
        - Uniqueness: (source_system, source_record_id)

        The browser-visits table is delegated to its own normalized schema.

        Raises:
            Exception: whatever the connection raised; the transaction is rolled
                back and the failure logged before re-raising.
        """
        try:
            if table_name == self._BROWSER_VISITS_TABLE:
                self._ensure_browser_visits_raw_table(table_name)
                return

            table = self._quote_ident(table_name)
            self.conn.execute(f"""
                CREATE TABLE IF NOT EXISTS {table} (
                    source_system TEXT NOT NULL,
                    source_record_id TEXT NOT NULL,
                    payload_json TEXT NOT NULL,
                    created_at TEXT NOT NULL DEFAULT (datetime('now')),
                    PRIMARY KEY (source_system, source_record_id)
                )
            """)
            self._ensure_indexes(table_name, ("source_system", "created_at"))
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            logger.error("Failed to ensure raw table %s: %s", table_name, e)
            raise

    def _ensure_browser_visits_raw_table(self, table_name: str) -> None:
        """Ensure browser visits raw table uses normalized columns (no payload_json).

        If the table already exists with a ``payload_json`` column, or without a
        ``url`` column, its rows are copied into a freshly created normalized table
        (fields pulled out of ``payload_json``) which then replaces the old one.
        """
        cursor = self.conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        )
        table_exists = cursor.fetchone() is not None
        table = self._quote_ident(table_name)

        def _create_schema(target_name: str) -> None:
            self.conn.execute(f"""
                CREATE TABLE IF NOT EXISTS {self._quote_ident(target_name)} (
                    source_system TEXT NOT NULL,
                    source_record_id TEXT NOT NULL,
                    record_id TEXT,
                    dataset_id TEXT,
                    url TEXT,
                    visited_at TEXT,
                    title TEXT,
                    favicon_url TEXT,
                    hostname TEXT,
                    device_name TEXT,
                    tab_id INTEGER,
                    window_id INTEGER,
                    incognito INTEGER,
                    transition_type TEXT,
                    pinned INTEGER,
                    audible INTEGER,
                    muted INTEGER,
                    opener_tab_id INTEGER,
                    referred_by TEXT,
                    created_at TEXT NOT NULL DEFAULT (datetime('now')),
                    PRIMARY KEY (source_system, source_record_id)
                )
            """)

        if not table_exists:
            _create_schema(table_name)
        else:
            existing_cols_cursor = self.conn.execute(f"PRAGMA table_info({table})")
            existing_cols = {row[1] for row in existing_cols_cursor.fetchall()}
            needs_migration = "payload_json" in existing_cols or "url" not in existing_cols
            if needs_migration:
                pre_count = self.conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
                tmp_table = f"{table_name}__migrated"
                tmp = self._quote_ident(tmp_table)
                self.conn.execute(f"DROP TABLE IF EXISTS {tmp}")
                _create_schema(tmp_table)
                flag = self._json_flag_sql
                self.conn.execute(f"""
                    INSERT OR REPLACE INTO {tmp} (
                        {self._BROWSER_VISITS_COLUMNS}
                    )
                    SELECT
                        source_system,
                        source_record_id,
                        COALESCE(json_extract(payload_json, '$.record_id'), source_record_id),
                        json_extract(payload_json, '$.dataset_id'),
                        json_extract(payload_json, '$.url'),
                        json_extract(payload_json, '$.visited_at'),
                        json_extract(payload_json, '$.title'),
                        json_extract(payload_json, '$.favicon_url'),
                        json_extract(payload_json, '$.hostname'),
                        json_extract(payload_json, '$.device_name'),
                        CAST(json_extract(payload_json, '$.tab_id') AS INTEGER),
                        CAST(json_extract(payload_json, '$.window_id') AS INTEGER),
                        {flag('incognito')},
                        json_extract(payload_json, '$.transition_type'),
                        {flag('pinned')},
                        {flag('audible')},
                        {flag('muted')},
                        CAST(json_extract(payload_json, '$.opener_tab_id') AS INTEGER),
                        json_extract(payload_json, '$.referred_by'),
                        COALESCE(created_at, datetime('now'))
                    FROM {table}
                """)
                self.conn.execute(f"DROP TABLE {table}")
                self.conn.execute(f"ALTER TABLE {tmp} RENAME TO {table}")
                post_count = self.conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
                logger.info(
                    "[PIPELINE:RAW] Migrated %s to normalized schema: rows_before=%d rows_after=%d",
                    table_name,
                    pre_count,
                    post_count,
                )

        self._ensure_indexes(table_name, ("source_system", "created_at", "visited_at", "url"))
        self.conn.commit()

    def write_raw_record(
        self,
        source_id: str,
        source_record_id: str,
        payload: Dict[str, Any],
        source_type: str = "chat_messages",
    ) -> None:
        """Write raw record to raw table.

        The target table is created on demand. An existing row with the same
        (source_system, source_record_id) is replaced.

        Args:
            source_id: Source identifier
            source_record_id: Unique record ID within source
            payload: Original payload dictionary
            source_type: Type of data ("chat_messages", "events", etc.)

        Raises:
            Exception: whatever the connection or JSON encoding raised; the
                transaction is rolled back and the failure logged before re-raising.
        """
        table_name = self.get_raw_table_name(source_id, source_type)
        self.ensure_raw_table(table_name)
        table = self._quote_ident(table_name)

        try:
            if table_name == self._BROWSER_VISITS_TABLE:
                # Browser visits are stored column-by-column instead of as a JSON blob.
                self.conn.execute(f"""
                    INSERT OR REPLACE INTO {table}
                    (
                        {self._BROWSER_VISITS_COLUMNS}
                    )
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'))
                """, (
                    source_id,
                    source_record_id,
                    payload.get("record_id") or source_record_id,
                    payload.get("dataset_id"),
                    payload.get("url"),
                    payload.get("visited_at"),
                    payload.get("title"),
                    payload.get("favicon_url"),
                    payload.get("hostname"),
                    payload.get("device_name"),
                    self._int_or_none(payload.get("tab_id")),
                    self._int_or_none(payload.get("window_id")),
                    self._flag_or_none(payload.get("incognito")),
                    payload.get("transition_type"),
                    self._flag_or_none(payload.get("pinned")),
                    self._flag_or_none(payload.get("audible")),
                    self._flag_or_none(payload.get("muted")),
                    self._int_or_none(payload.get("opener_tab_id")),
                    payload.get("referred_by"),
                ))
                self.conn.commit()
                return

            # Store payload as JSON string
            payload_json = json.dumps(payload, ensure_ascii=False)

            self.conn.execute(f"""
                INSERT OR REPLACE INTO {table}
                (source_system, source_record_id, payload_json, created_at)
                VALUES (?, ?, ?, datetime('now'))
            """, (source_id, source_record_id, payload_json))

            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            logger.error(
                "[PIPELINE:RAW] Failed to store raw record: source=%s, record_id=%s, error=%s",
                source_id,
                source_record_id,
                e,
            )
            raise
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Dict
|
|
4
|
+
|
|
5
|
+
from .raw_store import RawRecordRef, RawStore
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SQLiteRawStore(RawStore):
    """Raw store stub bound to a database connection; no write path is implemented."""

    def __init__(self, db_conn):
        """Keep a reference to the supplied connection on ``self.db_conn``."""
        self.db_conn = db_conn

    def write_file(self, file):  # pragma: no cover - not used for sqlite raw store
        """Always raises ``NotImplementedError``."""
        raise NotImplementedError

    def write_record(self, record: Dict[str, str]) -> RawRecordRef:
        """Always raises ``NotImplementedError``; *record* is ignored."""
        del record  # intentionally unused
        raise NotImplementedError("SQLiteRawStore not implemented yet")
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class EncryptionManager:
    """No-op encryption manager placeholder.

    ``encrypt_str`` and ``decrypt_str`` hand their input back unchanged; nothing
    is actually encrypted.
    """

    def __init__(self, dataset_id: str, user_id: str):
        """Record the dataset/user pair and fix the crypto version at 1."""
        self._crypto_version = 1
        self.user_id = user_id
        self.dataset_id = dataset_id

    def get_crypto_version(self) -> int:
        """Return the crypto version stored at construction time."""
        return self._crypto_version

    def encrypt_str(self, payload: str, crypto_version: int | None = None) -> str:
        """Return *payload* unchanged; *crypto_version* is accepted and ignored."""
        del crypto_version  # intentionally unused
        return payload

    def decrypt_str(self, ciphertext: str, crypto_version: int | None = None) -> str:
        """Return *ciphertext* unchanged; *crypto_version* is accepted and ignored."""
        del crypto_version  # intentionally unused
        return ciphertext
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Signal identity storage: my_phone_number (and optional my_signal_id) per dataset.
|
|
2
|
+
|
|
3
|
+
Used to set sender_type (self vs contact) and owner when ingesting Signal messages.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
# Module-level logger; the channel name is spelled out rather than derived from __name__.
logger = logging.getLogger("topos.storage.signal_identity")
|
|
12
|
+
|
|
13
|
+
# Table name that the SQL statements in this module are formatted against.
TABLE = "signal_identity"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def ensure_table(conn) -> None:
    """Create the signal identity table (one row per dataset_id) if missing, then commit."""
    ddl = f"""
        CREATE TABLE IF NOT EXISTS {TABLE} (
            dataset_id TEXT NOT NULL PRIMARY KEY,
            my_phone_number TEXT,
            my_signal_id TEXT,
            updated_at TEXT DEFAULT (datetime('now'))
        )
    """
    conn.execute(ddl)
    conn.commit()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_signal_identity(conn, dataset_id: str) -> Optional[dict]:
    """Look up the stored Signal identity for *dataset_id*.

    Returns:
        ``{"my_phone_number": ..., "my_signal_id": ...}`` when a row exists.
        ``None`` when *conn* or *dataset_id* is falsy, when no row exists, or when
        the lookup raised (the error is logged as a warning, not propagated).
    """
    if not (conn and dataset_id):
        return None
    query = f"SELECT my_phone_number, my_signal_id FROM {TABLE} WHERE dataset_id = ?"
    try:
        ensure_table(conn)
        record = conn.execute(query, (dataset_id,)).fetchone()
        if record:
            return {"my_phone_number": record[0], "my_signal_id": record[1]}
        return None
    except Exception as e:
        logger.warning("get_signal_identity failed: %s", e)
        return None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def put_signal_identity(
    conn,
    dataset_id: str,
    *,
    my_phone_number: Optional[str] = None,
    my_signal_id: Optional[str] = None,
) -> None:
    """Set Signal identity for dataset_id. Pass None to leave a field unchanged.

    The write is a single UPSERT: a new row is inserted when none exists,
    otherwise each field is replaced only when a non-None value was supplied
    (``COALESCE`` keeps the stored value for None). ``updated_at`` is refreshed
    on every call.

    Does nothing when *conn* or *dataset_id* is falsy. Failures are logged as a
    warning and not propagated.
    """
    if not conn or not dataset_id:
        return
    try:
        ensure_table(conn)
        # One statement instead of read-then-replace: a failed or stale read can
        # no longer cause an untouched field to be overwritten with NULL.
        conn.execute(
            f"""
            INSERT INTO {TABLE} (dataset_id, my_phone_number, my_signal_id, updated_at)
            VALUES (?, ?, ?, datetime('now'))
            ON CONFLICT(dataset_id) DO UPDATE SET
                my_phone_number = COALESCE(excluded.my_phone_number, my_phone_number),
                my_signal_id = COALESCE(excluded.my_signal_id, my_signal_id),
                updated_at = datetime('now')
            """,
            (dataset_id, my_phone_number, my_signal_id),
        )
        conn.commit()
    except Exception as e:
        logger.warning("put_signal_identity failed: %s", e)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Per-source settings: enabled, last_sync_at, last_error (for local_sync sources)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
# Module-level logger; the channel name is spelled out rather than derived from __name__.
logger = logging.getLogger("topos.storage.source_settings")
|
|
9
|
+
|
|
10
|
+
# Table name that the SQL statements in this module are formatted against.
TABLE = "user_ingestion_sources"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def ensure_table(conn) -> None:
    """Create the per-source settings table, keyed by (dataset_id, source_id), if missing; then commit."""
    ddl = f"""
        CREATE TABLE IF NOT EXISTS {TABLE} (
            dataset_id TEXT NOT NULL,
            source_id TEXT NOT NULL,
            enabled INTEGER NOT NULL DEFAULT 1,
            last_sync_at TEXT,
            last_error TEXT,
            updated_at TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (dataset_id, source_id)
        )
    """
    conn.execute(ddl)
    conn.commit()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_source_settings(conn, dataset_id: str, source_id: str) -> Optional[dict]:
    """Read ``{enabled, last_sync_at, last_error}`` for one source of a dataset.

    Returns:
        ``None`` when any argument is falsy. Otherwise a dict; when no row exists
        or the lookup raised (logged as a warning), the defaults are returned:
        enabled ``True`` with no ``last_sync_at`` / ``last_error``.
    """
    if not (conn and dataset_id and source_id):
        return None
    settings = {"enabled": True, "last_sync_at": None, "last_error": None}
    query = f"SELECT enabled, last_sync_at, last_error FROM {TABLE} WHERE dataset_id = ? AND source_id = ?"
    try:
        ensure_table(conn)
        found = conn.execute(query, (dataset_id, source_id)).fetchone()
        if found:
            settings = {"enabled": bool(found[0]), "last_sync_at": found[1], "last_error": found[2]}
    except Exception as e:
        logger.warning("get_source_settings failed: %s", e)
    return settings
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def put_source_settings(
    conn,
    dataset_id: str,
    source_id: str,
    *,
    enabled: Optional[bool] = None,
) -> None:
    """Update enabled; leave last_sync_at/last_error unchanged.

    Inserts the row when it does not exist yet, otherwise updates ``enabled`` and
    refreshes ``updated_at``. Does nothing when any of *conn*, *dataset_id*,
    *source_id* is falsy or when *enabled* is None. Failures are logged as a
    warning and not propagated.
    """
    if not conn or not dataset_id or not source_id or enabled is None:
        return
    try:
        ensure_table(conn)
        # Single UPSERT (same pattern as update_sync_result) rather than
        # SELECT-then-INSERT/UPDATE, so there is no window between check and write.
        conn.execute(
            f"""
            INSERT INTO {TABLE} (dataset_id, source_id, enabled, updated_at)
            VALUES (?, ?, ?, datetime('now'))
            ON CONFLICT(dataset_id, source_id) DO UPDATE SET
                enabled = excluded.enabled,
                updated_at = datetime('now')
            """,
            (dataset_id, source_id, 1 if enabled else 0),
        )
        conn.commit()
    except Exception as e:
        logger.warning("put_source_settings failed: %s", e)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def update_sync_result(
    conn,
    dataset_id: str,
    source_id: str,
    *,
    success: bool,
    last_sync_at: Optional[str] = None,
    last_error: Optional[str] = None,
) -> None:
    """Record the outcome of a sync for one source.

    On success, ``last_sync_at`` is stored and ``last_error`` is cleared; on
    failure, only ``last_error`` is stored. A missing row is inserted with
    ``enabled = 1``. A value that is None or empty is written as an empty string.

    Does nothing when any of *conn*, *dataset_id*, *source_id* is falsy. Failures
    are logged as a warning and not propagated.
    """
    if not (conn and dataset_id and source_id):
        return

    if success:
        stamp = last_sync_at or ""
        statement = f"""
            INSERT INTO {TABLE} (dataset_id, source_id, enabled, last_sync_at, last_error, updated_at)
            VALUES (?, ?, 1, ?, NULL, datetime('now'))
            ON CONFLICT(dataset_id, source_id) DO UPDATE SET
                last_sync_at = ?,
                last_error = NULL,
                updated_at = datetime('now')
        """
        params = (dataset_id, source_id, stamp, stamp)
    else:
        message = last_error or ""
        statement = f"""
            INSERT INTO {TABLE} (dataset_id, source_id, enabled, last_sync_at, last_error, updated_at)
            VALUES (?, ?, 1, NULL, ?, datetime('now'))
            ON CONFLICT(dataset_id, source_id) DO UPDATE SET
                last_error = ?,
                updated_at = datetime('now')
        """
        params = (dataset_id, source_id, message, message)

    try:
        ensure_table(conn)
        conn.execute(statement, params)
        conn.commit()
    except Exception as e:
        logger.warning("update_sync_result failed: %s", e)
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Dataset-scoped owner identity storage for self-name resolution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
# Module-level logger; the channel name is spelled out rather than derived from __name__.
logger = logging.getLogger("topos.storage.user_identity")
|
|
9
|
+
|
|
10
|
+
# Table name that the SQL statements in this module are formatted against.
TABLE = "user_identity"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def ensure_table(conn) -> None:
    """Create the ``user_identity`` table (one row per dataset_id) if missing, then commit."""
    ddl = f"""
        CREATE TABLE IF NOT EXISTS {TABLE} (
            dataset_id TEXT NOT NULL PRIMARY KEY,
            display_name TEXT,
            updated_at TEXT DEFAULT (datetime('now'))
        )
        """
    conn.execute(ddl)
    conn.commit()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_user_identity(conn, dataset_id: str) -> Optional[dict]:
    """Look up the stored owner identity for *dataset_id*.

    Returns:
        ``{"display_name": ...}`` when a row exists. ``None`` when *conn* or
        *dataset_id* is falsy, when no row exists, or when the lookup raised
        (the error is logged as a warning, not propagated).
    """
    if not (conn and dataset_id):
        return None
    query = f"SELECT display_name FROM {TABLE} WHERE dataset_id = ?"
    try:
        ensure_table(conn)
        record = conn.execute(query, (dataset_id,)).fetchone()
        if record:
            return {"display_name": record[0]}
        return None
    except Exception as e:
        logger.warning("get_user_identity failed: %s", e)
        return None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def put_user_identity(
    conn,
    dataset_id: str,
    *,
    display_name: Optional[str] = None,
) -> None:
    """Set dataset-scoped owner identity. ``None`` leaves the field unchanged.

    The write is a single UPSERT: a new row is inserted when none exists,
    otherwise ``display_name`` is replaced only when a non-None value was
    supplied (``COALESCE`` keeps the stored value for None). ``updated_at`` is
    refreshed on every call.

    Does nothing when *conn* or *dataset_id* is falsy. Failures are logged as a
    warning and not propagated.
    """
    if not conn or not dataset_id:
        return
    try:
        ensure_table(conn)
        # One statement instead of read-then-replace: a failed or stale read can
        # no longer cause the stored name to be overwritten with NULL.
        conn.execute(
            f"""
            INSERT INTO {TABLE} (dataset_id, display_name, updated_at)
            VALUES (?, ?, datetime('now'))
            ON CONFLICT(dataset_id) DO UPDATE SET
                display_name = COALESCE(excluded.display_name, display_name),
                updated_at = datetime('now')
            """,
            (dataset_id, display_name),
        )
        conn.commit()
    except Exception as e:
        logger.warning("put_user_identity failed: %s", e)
|