topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""File parsers for ingestion files (JSONL, JSON, CSV)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
from typing import Any, AsyncIterator, Dict
|
|
9
|
+
|
|
10
|
+
from .parsers.chatgpt_conversation_flattener import (
|
|
11
|
+
flatten_conversation_array,
|
|
12
|
+
is_conversation_format,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger("topos.ingestion.parser")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _strip_json_comments(content: str) -> str:
|
|
19
|
+
"""Remove // and /* */ comments while preserving string literals."""
|
|
20
|
+
out: list[str] = []
|
|
21
|
+
i = 0
|
|
22
|
+
n = len(content)
|
|
23
|
+
in_string = False
|
|
24
|
+
escaped = False
|
|
25
|
+
|
|
26
|
+
while i < n:
|
|
27
|
+
ch = content[i]
|
|
28
|
+
nxt = content[i + 1] if i + 1 < n else ""
|
|
29
|
+
|
|
30
|
+
if in_string:
|
|
31
|
+
out.append(ch)
|
|
32
|
+
if escaped:
|
|
33
|
+
escaped = False
|
|
34
|
+
elif ch == "\\":
|
|
35
|
+
escaped = True
|
|
36
|
+
elif ch == '"':
|
|
37
|
+
in_string = False
|
|
38
|
+
i += 1
|
|
39
|
+
continue
|
|
40
|
+
|
|
41
|
+
if ch == '"':
|
|
42
|
+
in_string = True
|
|
43
|
+
out.append(ch)
|
|
44
|
+
i += 1
|
|
45
|
+
continue
|
|
46
|
+
|
|
47
|
+
if ch == "/" and nxt == "/":
|
|
48
|
+
i += 2
|
|
49
|
+
while i < n and content[i] not in "\r\n":
|
|
50
|
+
i += 1
|
|
51
|
+
continue
|
|
52
|
+
|
|
53
|
+
if ch == "/" and nxt == "*":
|
|
54
|
+
i += 2
|
|
55
|
+
while i < n - 1 and not (content[i] == "*" and content[i + 1] == "/"):
|
|
56
|
+
if content[i] in "\r\n":
|
|
57
|
+
out.append(content[i])
|
|
58
|
+
i += 1
|
|
59
|
+
i += 2 if i < n - 1 else 0
|
|
60
|
+
continue
|
|
61
|
+
|
|
62
|
+
out.append(ch)
|
|
63
|
+
i += 1
|
|
64
|
+
|
|
65
|
+
return "".join(out)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _load_json_with_optional_comments(content: str) -> Any:
|
|
69
|
+
try:
|
|
70
|
+
return json.loads(content)
|
|
71
|
+
except json.JSONDecodeError as exc:
|
|
72
|
+
stripped = _strip_json_comments(content)
|
|
73
|
+
if stripped != content:
|
|
74
|
+
try:
|
|
75
|
+
logger.info("Parsed JSON payload after stripping comments")
|
|
76
|
+
return json.loads(stripped)
|
|
77
|
+
except json.JSONDecodeError as commented_exc:
|
|
78
|
+
raise ValueError(
|
|
79
|
+
"Failed to parse JSON file: "
|
|
80
|
+
f"{commented_exc.msg} (line {commented_exc.lineno}, column {commented_exc.colno})"
|
|
81
|
+
) from commented_exc
|
|
82
|
+
raise ValueError(
|
|
83
|
+
f"Failed to parse JSON file: {exc.msg} (line {exc.lineno}, column {exc.colno})"
|
|
84
|
+
) from exc
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
async def parse_jsonl_stream(file_stream: AsyncIterator[bytes]) -> AsyncIterator[Dict[str, Any]]:
    """Yield one parsed JSON value per non-empty line of a JSONL byte stream.

    Chunks may split a line at any byte; the incomplete tail is buffered until
    its newline arrives. A final line without a trailing newline is parsed too.
    Lines that are not valid UTF-8 or not valid JSON are logged and skipped, so
    one bad line does not abort the rest of the stream.

    Args:
        file_stream: Async iterator of raw byte chunks.

    Yields:
        The decoded JSON value of each line, as returned by ``json.loads``.
    """

    def _decode(raw_line: bytes, number: int) -> tuple:
        """Return ``(True, value)`` on success or ``(False, None)`` after logging."""
        try:
            # utf-8-sig drops a leading BOM, which would otherwise make the
            # first record of a BOM-prefixed file fail to parse.
            return True, json.loads(raw_line.decode("utf-8-sig"))
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            logger.warning("Failed to parse JSONL line %d: %s", number, exc)
            return False, None

    buffer = b""
    # Counts non-empty lines only, so blank lines do not advance the number
    # reported in warnings.
    line_num = 0
    async for chunk in file_stream:
        buffer += chunk
        if b"\n" not in buffer:
            continue
        # Split once per chunk; the last piece is the incomplete tail.
        *complete_lines, buffer = buffer.split(b"\n")
        for raw_line in complete_lines:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            line_num += 1
            ok, record = _decode(raw_line, line_num)
            if ok:
                yield record

    tail = buffer.strip()
    if tail:
        line_num += 1
        ok, record = _decode(tail, line_num)
        if ok:
            yield record
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
async def parse_json_stream(file_stream: AsyncIterator[bytes]) -> AsyncIterator[Dict[str, Any]]:
    """Read a whole JSON document from a byte stream and yield its records.

    The stream is fully buffered, decoded and parsed with
    ``_load_json_with_optional_comments``. What is yielded depends on the
    document's shape:

    * a list whose first element passes ``is_conversation_format``: the records
      produced by ``flatten_conversation_array`` (system messages excluded);
    * any other list: each element as-is;
    * a dict that passes ``is_conversation_format``: the records produced by
      ``flatten_conversation`` (system messages excluded);
    * a dict with a list under ``"browsing_history"``: each dict entry of that
      list, with the top-level ``"user_id"`` copied in when the entry has none;
    * any other dict: the dict itself as a single record.

    Args:
        file_stream: Async iterator of raw byte chunks.

    Yields:
        One record per item as described above.

    Raises:
        ValueError: If the document is not valid JSON, or its top-level value
            is neither an array nor an object.
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    chunks = []
    async for chunk in file_stream:
        chunks.append(chunk)
    # utf-8-sig decodes plain UTF-8 unchanged but also drops a leading BOM,
    # which json.loads would otherwise reject.
    content = b"".join(chunks).decode("utf-8-sig")
    data = _load_json_with_optional_comments(content)
    if isinstance(data, list):
        # Only the first element is inspected to detect a conversation array.
        if data and is_conversation_format(data[0]):
            logger.info("Detected ChatGPT conversation format, flattening conversations")
            # Flatten conversation array to individual message records
            for record in flatten_conversation_array(data, include_system=False):
                yield record
        else:
            # Regular array - yield records as-is
            for record in data:
                yield record
    elif isinstance(data, dict):
        # Check if single conversation object
        if is_conversation_format(data):
            logger.info("Detected ChatGPT conversation format (single object), flattening")
            from .parsers.chatgpt_conversation_flattener import flatten_conversation
            for record in flatten_conversation(data, include_system=False):
                yield record
        elif isinstance(data.get("browsing_history"), list):
            # Demo browser-history payloads wrap visit rows under a top-level key.
            # Flatten to per-visit records so source parsers can validate normally.
            owner_user_id = data.get("user_id")
            for record in data.get("browsing_history") or []:
                # NOTE(review): non-dict entries are skipped here; confirm this
                # nesting of the yield against the original file.
                if isinstance(record, dict):
                    if owner_user_id and "user_id" not in record:
                        # Copy rather than mutate the parsed document.
                        record = {**record, "user_id": owner_user_id}
                    yield record
        else:
            yield data
    else:
        raise ValueError(f"JSON must be array or object, got {type(data)}")
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
async def parse_csv_stream(file_stream: AsyncIterator[bytes], delimiter: str = ",") -> AsyncIterator[Dict[str, Any]]:
    """Read a whole CSV document from a byte stream and yield one dict per row.

    The stream is fully buffered and decoded, then parsed with
    ``csv.DictReader`` using the first row as the header.

    Args:
        file_stream: Async iterator of raw byte chunks.
        delimiter: Field separator passed to ``csv.DictReader``.

    Yields:
        A dict for each data row, keyed by the header names.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    import io

    chunks = []
    async for chunk in file_stream:
        chunks.append(chunk)
    # utf-8-sig drops a leading BOM so it does not become part of the first
    # header name.
    content = b"".join(chunks).decode("utf-8-sig")
    # Let the csv module do the line splitting (newline="" disables newline
    # translation). str.splitlines() would strip line breaks that belong to
    # quoted fields and would also split on separators such as \x0b or \u2028.
    reader = csv.DictReader(io.StringIO(content, newline=""), delimiter=delimiter)
    for row in reader:
        yield row
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
async def parse_file(file_stream: AsyncIterator[bytes], file_format: str) -> AsyncIterator[Dict[str, Any]]:
    """Dispatch a byte stream to the parser for its declared format.

    Args:
        file_stream: Async iterator of raw byte chunks.
        file_format: Format name, matched case-insensitively. ``"jsonl"`` and
            ``"ndjson"`` select the JSONL parser, ``"json"`` the JSON parser
            and ``"csv"`` the CSV parser (with its default delimiter).

    Yields:
        Every record produced by the selected parser, in order.

    Raises:
        ValueError: If ``file_format`` is not one of the supported names. As
            with any async generator, this surfaces on first iteration.
    """
    kind = file_format.lower()
    if kind == "json":
        source = parse_json_stream(file_stream)
    elif kind == "csv":
        source = parse_csv_stream(file_stream)
    elif kind in ("jsonl", "ndjson"):
        source = parse_jsonl_stream(file_stream)
    else:
        raise ValueError(f"Unsupported file format: {file_format}")

    async for item in source:
        yield item
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Parser registry."""
|
|
2
|
+
|
|
3
|
+
from .base import Parser
|
|
4
|
+
from .browser_parser import BrowserParser, BrowserEventsParser
|
|
5
|
+
from .calendar_parser import CalendarParser
|
|
6
|
+
from .chatgpt_parser import ChatGPTParser
|
|
7
|
+
from .grok_parser import GrokParser
|
|
8
|
+
from .messenger_parser import ImessageParser, SignalParser
|
|
9
|
+
|
|
10
|
+
# Maps a schema identifier string to the parser class registered for it.
# Several identifiers may share one class.
PARSER_REGISTRY = {
    "chatgpt.conversation.v1": ChatGPTParser,
    "chatgpt.conversation.v2": ChatGPTParser,  # Same parser, flattened records match v1 format
    "grok.conversation.v1": GrokParser,
    "calendar.events.v1": CalendarParser,
    "browser.visits.v1": BrowserParser,  # Sprint 3: Browser plugin visits
    # NOTE(review): "browser_history_dem" may be a truncated "browser_history_demo";
    # confirm against the source definitions before changing this runtime key.
    "managed.file.browser_history_dem.v1": BrowserParser,  # Hosted browser demo source alias
    "browser.events.v1": BrowserEventsParser,  # Clicks, highlights, star_page, VIDEO_PLAY
    "imessage.messages.v1": ImessageParser,  # Sprint 02: Messenger ingestion
    "signal.messages.v1": SignalParser,
}

# Names exported by this package: the base class, every parser class
# imported above, and the registry itself.
__all__ = [
    "Parser",
    "BrowserParser",
    "BrowserEventsParser",
    "ChatGPTParser",
    "GrokParser",
    "CalendarParser",
    "ImessageParser",
    "SignalParser",
    "PARSER_REGISTRY",
]
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any, Dict
|
|
5
|
+
|
|
6
|
+
from ..sources.base import RawRecord
|
|
7
|
+
from ..validation.base import ValidationResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class NormalizedRecord:
    """Immutable result of parsing one raw record.

    ``frozen=True`` prevents rebinding the attributes; the ``payload`` dict
    itself can still be mutated.
    """

    # Identifier assigned to the record by the parser.
    record_id: str
    # Normalized fields. Values are not restricted to strings: the parsers
    # store booleans, numbers and nested values here as well.
    payload: Dict[str, Any]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Parser:
    """Interface that every source parser implements.

    The base class provides no behavior: each method raises
    ``NotImplementedError`` until a subclass overrides it.
    """

    def parse(self, raw: RawRecord) -> NormalizedRecord:
        """Convert ``raw`` into a ``NormalizedRecord``."""
        raise NotImplementedError()

    def validate(self, record: RawRecord) -> ValidationResult:
        """Check ``record`` and report the outcome as a ``ValidationResult``."""
        raise NotImplementedError()

    def schema_id(self) -> str:
        """Return the schema identifier this parser handles."""
        raise NotImplementedError()
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""Browser visits parser for ingestion layer (Sprint 3)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any, Dict
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
|
|
10
|
+
from ..sources.base import RawRecord
|
|
11
|
+
from ..validation.base import ValidationResult
|
|
12
|
+
from .base import NormalizedRecord, Parser
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger("topos.ingestion.parser.browser")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class BrowserParser(Parser):
    """Parser for browser_visits records. Stores records as-is with minimal normalization."""

    # Dataset the parsed records are attributed to; copied into every payload.
    dataset_id: str
    # Schema identifier reported by schema_id().
    _schema_id: str = "browser.visits.v1"

    def parse(self, raw: RawRecord) -> NormalizedRecord:
        """Build a normalized visit record from ``raw.payload``.

        The timestamp is read from ``visited_at``, falling back to
        ``timestamp``. Strings are kept unchanged, a missing value is replaced
        by the current UTC time in ISO format, and anything else is converted
        with ``str()``. Payload fields that are ``None`` are left out of the
        result.

        Args:
            raw: Raw record whose ``payload`` is a dict of visit fields.

        Returns:
            A ``NormalizedRecord`` whose id is ``"<url>_<timestamp>"``, using
            ``raw.record_id`` in place of the url when the url is missing,
            ``None`` or empty.
        """
        payload = raw.payload

        # Normalize timestamp if present
        visited_at = payload.get("visited_at") or payload.get("timestamp")
        if isinstance(visited_at, str):
            # Assumed to be ISO format already; kept as-is without validation.
            ts = visited_at
        elif visited_at is None:
            # Use current time if missing
            ts = datetime.now(timezone.utc).isoformat()
        else:
            ts = str(visited_at)

        # A url that is present but None or empty must not crash the id
        # construction or produce a bare "_<ts>" id, so fall back to the raw
        # record id. The f-string also tolerates non-string values.
        id_base = payload.get("url") or raw.record_id
        record_id = f"{id_base}_{ts}"

        # Create normalized record with all browser fields preserved
        normalized = {
            "record_id": record_id,
            "dataset_id": self.dataset_id,
            "url": payload.get("url", ""),
            "visited_at": ts,
            "title": payload.get("title", ""),
            "favicon_url": payload.get("favicon_url"),
            "hostname": payload.get("hostname", ""),
            "device_name": payload.get("device_name", ""),
            "tab_id": payload.get("tab_id"),
            "window_id": payload.get("window_id"),
            "incognito": payload.get("incognito"),
            "transition_type": payload.get("transition_type", ""),
            "pinned": payload.get("pinned"),
            "audible": payload.get("audible"),
            "muted": payload.get("muted"),
            "opener_tab_id": payload.get("opener_tab_id"),
            "referred_by": payload.get("referred_by"),
        }

        # Remove None values to keep payload clean
        normalized = {k: v for k, v in normalized.items() if v is not None}

        logger.debug(
            "[PIPELINE:PARSER] Parsed browser visit: url=%s, visited_at=%s",
            normalized.get("url", "")[:50],
            normalized.get("visited_at", ""),
        )
        return NormalizedRecord(record_id=normalized["record_id"], payload=normalized)

    def validate(self, record: RawRecord) -> ValidationResult:
        """Check that the payload is a dict with a url and a timestamp key.

        Only key presence is checked; values are not inspected.

        Args:
            record: Raw record to validate.

        Returns:
            A ``ValidationResult`` that is invalid, with a single error
            message, when the payload is not a dict, lacks ``url``, or lacks
            both ``visited_at`` and ``timestamp``; valid otherwise.
        """
        payload = record.payload
        if not isinstance(payload, dict):
            return ValidationResult(
                is_valid=False,
                errors=["Record must be a dict"],
                metadata={},
            )

        # Required fields for browser_visits
        if "url" not in payload:
            return ValidationResult(
                is_valid=False,
                errors=["Missing required field: url"],
                metadata={},
            )

        if "visited_at" not in payload and "timestamp" not in payload:
            return ValidationResult(
                is_valid=False,
                errors=["Missing required field: visited_at (or timestamp)"],
                metadata={},
            )

        return ValidationResult(is_valid=True, errors=[], metadata={})

    def schema_id(self) -> str:
        """Return the schema identifier configured for this parser."""
        return self._schema_id
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@dataclass
class BrowserEventsParser(Parser):
    """Parser for browser_events: clicks, highlights, star_page, VIDEO_PLAY. Stores event_type + payload."""

    # Dataset the parsed records are attributed to; copied into every payload.
    dataset_id: str
    # Schema identifier reported by schema_id().
    _schema_id: str = "browser.events.v1"

    def parse(self, raw: RawRecord) -> NormalizedRecord:
        """Build a normalized event record from ``raw.payload``.

        The timestamp is the first truthy value among ``visited_at``,
        ``starred_at`` and ``created_at``, or the current UTC time. Numeric
        values are interpreted as epoch seconds and rendered as ISO strings;
        everything else is converted with ``str()``. Payload fields that are
        ``None`` are left out of the result.

        Args:
            raw: Raw record whose ``payload`` is a dict of event fields.

        Returns:
            A ``NormalizedRecord`` whose id is
            ``"<event_type>_<url or raw.record_id, 80 chars max>_<timestamp, 24 chars max>"``.
        """
        payload = raw.payload
        event_type = payload.get("event_type") or "unknown"
        ts = (
            payload.get("visited_at")
            or payload.get("starred_at")
            or payload.get("created_at")
            or datetime.now(timezone.utc).isoformat()
        )
        if isinstance(ts, (int, float)):
            try:
                ts = datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
            except (ValueError, OverflowError, OSError):
                # Out-of-range numbers (e.g. millisecond epochs) cannot be
                # converted; keep the raw value rather than failing the record.
                ts = str(ts)
        else:
            ts = str(ts)
        url = payload.get("url") or ""
        record_id = f"{event_type}_{(url or raw.record_id)[:80]}_{ts[:24]}"
        normalized = {
            "record_id": record_id,
            "dataset_id": self.dataset_id,
            "event_type": event_type,
            "url": url,
            "visited_at": ts,
            "title": payload.get("title"),
            "favicon_url": payload.get("favicon_url"),
            "hostname": payload.get("hostname"),
            "device_name": payload.get("device_name"),
            "transition_type": payload.get("transition_type"),
            "content": payload.get("content"),
            "tab_id": payload.get("tab_id"),
            "window_id": payload.get("window_id"),
            "incognito": payload.get("incognito"),
            "pinned": payload.get("pinned"),
            "audible": payload.get("audible"),
            "muted": payload.get("muted"),
            "opener_tab_id": payload.get("opener_tab_id"),
            "starred_at": payload.get("starred_at"),
        }
        # Drop fields the payload did not provide.
        normalized = {k: v for k, v in normalized.items() if v is not None}
        logger.debug(
            "[PIPELINE:PARSER] Parsed browser event: event_type=%s, url=%s",
            event_type,
            url[:50] if url else None,
        )
        return NormalizedRecord(record_id=record_id, payload=normalized)

    def validate(self, record: RawRecord) -> ValidationResult:
        """Validate browser event: the payload must be a dict with a truthy event_type.

        NOTE(review): url and content are not checked here; confirm whether
        at least one of them is meant to be required.

        Args:
            record: Raw record to validate.

        Returns:
            A ``ValidationResult`` that is invalid, with a single error
            message, when the payload is not a dict or ``event_type`` is
            missing or empty; valid otherwise.
        """
        payload = record.payload
        if not isinstance(payload, dict):
            return ValidationResult(
                is_valid=False,
                errors=["Record must be a dict"],
                metadata={},
            )
        if not payload.get("event_type"):
            return ValidationResult(
                is_valid=False,
                errors=["Missing required field: event_type"],
                metadata={},
            )
        return ValidationResult(is_valid=True, errors=[], metadata={})

    def schema_id(self) -> str:
        """Return the schema identifier configured for this parser."""
        return self._schema_id
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from ..sources.base import RawRecord
|
|
6
|
+
from ..validation.base import ValidationResult
|
|
7
|
+
from .base import NormalizedRecord, Parser
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class CalendarParser(Parser):
    """Pass-through parser for ``calendar.events.v1`` records."""

    # Dataset this parser instance was created for.
    dataset_id: str

    def parse(self, raw: RawRecord) -> NormalizedRecord:
        """Wrap the raw record's id and payload, unchanged, in a ``NormalizedRecord``."""
        record_id, payload = raw.record_id, raw.payload
        return NormalizedRecord(payload=payload, record_id=record_id)

    def validate(self, record: RawRecord) -> ValidationResult:
        """Accept every record: always returns a valid result with no errors."""
        return ValidationResult(metadata={}, errors=[], is_valid=True)

    def schema_id(self) -> str:
        """Return the fixed schema identifier for calendar events."""
        return "calendar.events.v1"
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
"""ChatGPT conversation flattener.
|
|
2
|
+
|
|
3
|
+
Converts nested conversation objects from ChatGPT export format into flat message records.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any, Dict, Iterator, List, Optional
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger("topos.ingestion.parser.chatgpt_flattener")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def flatten_conversation(conversation: Dict[str, Any], include_system: bool = False) -> Iterator[Dict[str, Any]]:
    """Flatten a ChatGPT conversation object into individual message records.

    The ``mapping`` tree is walked depth-first, in pre-order, starting from
    every node whose parent is missing or not present in ``mapping``. The walk
    uses an explicit stack, so long conversations (where depth equals the
    number of messages) cannot hit Python's recursion limit. Each node is
    visited at most once.

    A node produces no record, but its children are still visited, when it
    has no message, when it is a system message and ``include_system`` is
    false, or when it is a ``text`` message with empty content.

    Args:
        conversation: ChatGPT conversation object with 'mapping' field
        include_system: Whether to include system messages (default: False)

    Yields:
        Flattened message records compatible with chatgpt.conversation.v1 format
    """
    mapping = conversation.get("mapping", {})
    if not mapping:
        logger.warning("Conversation has no mapping field")
        return  # Generator function - return without yielding means empty

    conv_id = conversation.get("conversation_id") or conversation.get("id", "")
    conv_title = conversation.get("title")
    conv_create_time = conversation.get("create_time")

    # Node ids already visited, shared across all roots.
    visited = set()

    def build_record(node_id: str, node: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Return the flat record for ``node``, or None when it is skipped."""
        message = node.get("message")
        # Skip nodes without messages (root nodes, etc.)
        if not message:
            return None

        # `or` guards tolerate keys that are present but null.
        role = ((message.get("author") or {}).get("role") or "").lower()

        # Skip system messages unless explicitly included
        if role == "system" and not include_system:
            return None

        content_obj = message.get("content") or {}
        content_type = content_obj.get("content_type", "text")
        content = extract_content(content_obj, content_type)

        # Skip text messages with empty content; other content types are kept.
        if not content and content_type == "text":
            return None

        # Fall back to the conversation's creation time when the message has none.
        create_time = message.get("create_time")
        if create_time is None:
            create_time = conv_create_time

        # ChatGPT uses: user, assistant, system, tool.
        # Tool messages are reported as coming from the assistant.
        mapped_role = "assistant" if role == "tool" else role

        return {
            "id": message.get("id", node_id),
            "thread_id": conv_id,
            "role": mapped_role,
            "content": content,
            "created_at": create_time,
            # Additional metadata (optional, for debugging)
            "_metadata": {
                "conversation_title": conv_title,
                "node_id": node_id,
                "parent_id": node.get("parent"),
                "content_type": content_type,
                "original_role": role,
            },
        }

    def walk(root_id: str) -> Iterator[Dict[str, Any]]:
        """Yield records for the subtree under ``root_id`` in pre-order."""
        stack: List[str] = [root_id]
        while stack:
            node_id = stack.pop()
            if node_id in visited or node_id not in mapping:
                continue

            visited.add(node_id)
            node = mapping[node_id]

            record = build_record(node_id, node)
            if record is not None:
                yield record

            children = node.get("children")
            if children:
                # Push in reverse so the first child is popped (visited) first.
                stack.extend(reversed(list(children)))

    # Find root nodes (nodes with no parent or parent not in mapping)
    root_nodes = []
    for node_id, node in mapping.items():
        parent = node.get("parent")
        if not parent or parent not in mapping:
            root_nodes.append(node_id)

    # Start traversal from root nodes. A failure abandons the rest of that
    # root's subtree but still lets the remaining roots be processed.
    for root_id in root_nodes:
        try:
            yield from walk(root_id)
        except Exception as e:
            logger.warning("Error traversing root node %s: %s", root_id, e, exc_info=True)
            continue
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _join_truthy_parts(content_obj: Dict[str, Any]) -> str:
    """Join the truthy entries of ``content_obj["parts"]`` with single spaces.

    Returns an empty string when ``parts`` is absent or is not a list.
    """
    parts = content_obj.get("parts", [])
    if isinstance(parts, list):
        return " ".join(str(p) for p in parts if p)
    return ""


def extract_content(content_obj: Dict[str, Any], content_type: str) -> str:
    """Extract text content from content object based on content type.

    Args:
        content_obj: Content object from message. Anything that is not a
            dict (for example ``None``) yields an empty string instead of
            raising.
        content_type: Type of content. Handled explicitly: ``text``,
            ``thoughts``, ``reasoning_recap``, ``code``,
            ``multimodal_text`` and ``execution_output``. Any other value
            is logged as unknown and falls back to joining ``parts``.

    Returns:
        Extracted text content, or ``""`` when nothing usable is present.
    """
    # A missing/null content object has nothing to extract; bail out early
    # rather than failing on ``.get`` below.
    if not isinstance(content_obj, dict):
        return ""

    if content_type == "text":
        parts = content_obj.get("parts", [])
        if isinstance(parts, list):
            # Join parts, filtering out empty and whitespace-only entries
            return " ".join(str(p) for p in parts if p and str(p).strip())
        return str(parts) if parts else ""

    elif content_type == "thoughts":
        thoughts = content_obj.get("thoughts", [])
        if isinstance(thoughts, list):
            thought_texts = []
            for thought in thoughts:
                if isinstance(thought, dict):
                    # Prefer the summary; use the content only when the
                    # summary is empty or missing.
                    summary = thought.get("summary", "")
                    body = thought.get("content", "")
                    # str() keeps the join below safe when a field holds a
                    # non-string value (list, number, nested object).
                    if summary:
                        thought_texts.append(str(summary))
                    elif body:
                        thought_texts.append(str(body))
                elif isinstance(thought, str):
                    thought_texts.append(thought)
            return " ".join(thought_texts)
        return str(thoughts) if thoughts else ""

    elif content_type == "reasoning_recap":
        recap = content_obj.get("reasoning_recap", "")
        if recap:
            return str(recap)
        # Fallback to parts if available
        return _join_truthy_parts(content_obj)

    elif content_type == "code":
        code = content_obj.get("code", "")
        if code:
            # Wrap in a fenced block so the code stays distinguishable
            # from surrounding prose.
            return f"```\n{code}\n```"
        # Fallback to parts
        return _join_truthy_parts(content_obj)

    elif content_type == "multimodal_text":
        # Multimodal parts may mix strings with non-text objects; keep only
        # the non-blank string parts.
        parts = content_obj.get("parts", [])
        if isinstance(parts, list):
            text_parts = [p for p in parts if isinstance(p, str) and p.strip()]
            return " ".join(text_parts)
        return ""

    elif content_type == "execution_output":
        output = content_obj.get("output", "")
        if output:
            return str(output)
        return _join_truthy_parts(content_obj)

    else:
        # Unknown content type - try to extract parts
        logger.warning(f"Unknown content type: {content_type}")
        return _join_truthy_parts(content_obj)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def is_conversation_format(record: Dict[str, Any]) -> bool:
    """Report whether ``record`` has the shape of a ChatGPT conversation object.

    A record qualifies when it is a dict, has a ``mapping`` key whose value
    is itself a dict, and carries a ``conversation_id`` or ``id`` key.

    Args:
        record: Record to check

    Returns:
        True if record appears to be a conversation object
    """
    # Guard clauses: reject anything that is not a dict holding a dict "mapping".
    if not isinstance(record, dict) or "mapping" not in record:
        return False
    if not isinstance(record.get("mapping"), dict):
        return False

    # Either identifier key is sufficient.
    has_identifier = "conversation_id" in record or "id" in record
    return has_identifier
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def flatten_conversation_array(conversations: List[Dict[str, Any]], include_system: bool = False) -> Iterator[Dict[str, Any]]:
    """Yield flattened message records for every conversation in a list.

    Entries that are empty, not dicts, or not recognised by
    ``is_conversation_format`` are skipped with a warning. An exception
    raised while flattening one conversation is logged and does not stop
    processing of the remaining conversations.

    Args:
        conversations: List of conversation objects
        include_system: Whether to include system messages

    Yields:
        Flattened message records
    """
    # A falsy input (None, empty list) produces no records at all.
    for conv in conversations or ():
        is_nonempty_dict = bool(conv) and isinstance(conv, dict)
        if not is_nonempty_dict:
            logger.warning(f"Skipping invalid conversation object: {type(conv)}")
            continue

        if not is_conversation_format(conv):
            logger.warning(f"Skipping non-conversation object: {type(conv)}")
            continue

        try:
            yield from flatten_conversation(conv, include_system=include_system)
        except Exception as e:
            # Best effort: one malformed conversation must not abort the batch.
            logger.error(f"Error flattening conversation {conv.get('conversation_id', conv.get('id', 'unknown'))}: {e}", exc_info=True)
|