topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,174 @@
1
+ """File parsers for ingestion files (JSONL, JSON, CSV)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import json
7
+ import logging
8
+ from typing import Any, AsyncIterator, Dict
9
+
10
+ from .parsers.chatgpt_conversation_flattener import (
11
+ flatten_conversation_array,
12
+ is_conversation_format,
13
+ )
14
+
15
+ logger = logging.getLogger("topos.ingestion.parser")
16
+
17
+
18
+ def _strip_json_comments(content: str) -> str:
19
+ """Remove // and /* */ comments while preserving string literals."""
20
+ out: list[str] = []
21
+ i = 0
22
+ n = len(content)
23
+ in_string = False
24
+ escaped = False
25
+
26
+ while i < n:
27
+ ch = content[i]
28
+ nxt = content[i + 1] if i + 1 < n else ""
29
+
30
+ if in_string:
31
+ out.append(ch)
32
+ if escaped:
33
+ escaped = False
34
+ elif ch == "\\":
35
+ escaped = True
36
+ elif ch == '"':
37
+ in_string = False
38
+ i += 1
39
+ continue
40
+
41
+ if ch == '"':
42
+ in_string = True
43
+ out.append(ch)
44
+ i += 1
45
+ continue
46
+
47
+ if ch == "/" and nxt == "/":
48
+ i += 2
49
+ while i < n and content[i] not in "\r\n":
50
+ i += 1
51
+ continue
52
+
53
+ if ch == "/" and nxt == "*":
54
+ i += 2
55
+ while i < n - 1 and not (content[i] == "*" and content[i + 1] == "/"):
56
+ if content[i] in "\r\n":
57
+ out.append(content[i])
58
+ i += 1
59
+ i += 2 if i < n - 1 else 0
60
+ continue
61
+
62
+ out.append(ch)
63
+ i += 1
64
+
65
+ return "".join(out)
66
+
67
+
68
+ def _load_json_with_optional_comments(content: str) -> Any:
69
+ try:
70
+ return json.loads(content)
71
+ except json.JSONDecodeError as exc:
72
+ stripped = _strip_json_comments(content)
73
+ if stripped != content:
74
+ try:
75
+ logger.info("Parsed JSON payload after stripping comments")
76
+ return json.loads(stripped)
77
+ except json.JSONDecodeError as commented_exc:
78
+ raise ValueError(
79
+ "Failed to parse JSON file: "
80
+ f"{commented_exc.msg} (line {commented_exc.lineno}, column {commented_exc.colno})"
81
+ ) from commented_exc
82
+ raise ValueError(
83
+ f"Failed to parse JSON file: {exc.msg} (line {exc.lineno}, column {exc.colno})"
84
+ ) from exc
85
+
86
+
87
+ async def parse_jsonl_stream(file_stream: AsyncIterator[bytes]) -> AsyncIterator[Dict[str, Any]]:
88
+ buffer = b""
89
+ line_num = 0
90
+ async for chunk in file_stream:
91
+ buffer += chunk
92
+ while b"\n" in buffer:
93
+ line, buffer = buffer.split(b"\n", 1)
94
+ line = line.strip()
95
+ if not line:
96
+ continue
97
+ line_num += 1
98
+ try:
99
+ record = json.loads(line.decode("utf-8"))
100
+ yield record
101
+ except json.JSONDecodeError as exc:
102
+ logger.warning("Failed to parse JSONL line %d: %s", line_num, exc)
103
+ continue
104
+ if buffer.strip():
105
+ line_num += 1
106
+ try:
107
+ record = json.loads(buffer.decode("utf-8"))
108
+ yield record
109
+ except json.JSONDecodeError as exc:
110
+ logger.warning("Failed to parse JSONL line %d: %s", line_num, exc)
111
+
112
+
113
+ async def parse_json_stream(file_stream: AsyncIterator[bytes]) -> AsyncIterator[Dict[str, Any]]:
114
+ chunks = []
115
+ async for chunk in file_stream:
116
+ chunks.append(chunk)
117
+ content = b"".join(chunks).decode("utf-8")
118
+ data = _load_json_with_optional_comments(content)
119
+ if isinstance(data, list):
120
+ # Check if this is a ChatGPT conversation array
121
+ if data and is_conversation_format(data[0]):
122
+ logger.info("Detected ChatGPT conversation format, flattening conversations")
123
+ # Flatten conversation array to individual message records
124
+ for record in flatten_conversation_array(data, include_system=False):
125
+ yield record
126
+ else:
127
+ # Regular array - yield records as-is
128
+ for record in data:
129
+ yield record
130
+ elif isinstance(data, dict):
131
+ # Check if single conversation object
132
+ if is_conversation_format(data):
133
+ logger.info("Detected ChatGPT conversation format (single object), flattening")
134
+ from .parsers.chatgpt_conversation_flattener import flatten_conversation
135
+ for record in flatten_conversation(data, include_system=False):
136
+ yield record
137
+ elif isinstance(data.get("browsing_history"), list):
138
+ # Demo browser-history payloads wrap visit rows under a top-level key.
139
+ # Flatten to per-visit records so source parsers can validate normally.
140
+ owner_user_id = data.get("user_id")
141
+ for record in data.get("browsing_history") or []:
142
+ if isinstance(record, dict):
143
+ if owner_user_id and "user_id" not in record:
144
+ record = {**record, "user_id": owner_user_id}
145
+ yield record
146
+ else:
147
+ yield data
148
+ else:
149
+ raise ValueError(f"JSON must be array or object, got {type(data)}")
150
+
151
+
152
+ async def parse_csv_stream(file_stream: AsyncIterator[bytes], delimiter: str = ",") -> AsyncIterator[Dict[str, Any]]:
153
+ chunks = []
154
+ async for chunk in file_stream:
155
+ chunks.append(chunk)
156
+ content = b"".join(chunks).decode("utf-8")
157
+ reader = csv.DictReader(content.splitlines(), delimiter=delimiter)
158
+ for row in reader:
159
+ yield row
160
+
161
+
162
+ async def parse_file(file_stream: AsyncIterator[bytes], file_format: str) -> AsyncIterator[Dict[str, Any]]:
163
+ format_lower = file_format.lower()
164
+ if format_lower in {"jsonl", "ndjson"}:
165
+ async for record in parse_jsonl_stream(file_stream):
166
+ yield record
167
+ elif format_lower == "json":
168
+ async for record in parse_json_stream(file_stream):
169
+ yield record
170
+ elif format_lower == "csv":
171
+ async for record in parse_csv_stream(file_stream):
172
+ yield record
173
+ else:
174
+ raise ValueError(f"Unsupported file format: {file_format}")
@@ -0,0 +1,32 @@
1
+ """Parser registry."""
2
+
3
+ from .base import Parser
4
+ from .browser_parser import BrowserParser, BrowserEventsParser
5
+ from .calendar_parser import CalendarParser
6
+ from .chatgpt_parser import ChatGPTParser
7
+ from .grok_parser import GrokParser
8
+ from .messenger_parser import ImessageParser, SignalParser
9
+
10
+ PARSER_REGISTRY = {
11
+ "chatgpt.conversation.v1": ChatGPTParser,
12
+ "chatgpt.conversation.v2": ChatGPTParser, # Same parser, flattened records match v1 format
13
+ "grok.conversation.v1": GrokParser,
14
+ "calendar.events.v1": CalendarParser,
15
+ "browser.visits.v1": BrowserParser, # Sprint 3: Browser plugin visits
16
+ "managed.file.browser_history_dem.v1": BrowserParser, # Hosted browser demo source alias
17
+ "browser.events.v1": BrowserEventsParser, # Clicks, highlights, star_page, VIDEO_PLAY
18
+ "imessage.messages.v1": ImessageParser, # Sprint 02: Messenger ingestion
19
+ "signal.messages.v1": SignalParser,
20
+ }
21
+
22
+ __all__ = [
23
+ "Parser",
24
+ "BrowserParser",
25
+ "BrowserEventsParser",
26
+ "ChatGPTParser",
27
+ "GrokParser",
28
+ "CalendarParser",
29
+ "ImessageParser",
30
+ "SignalParser",
31
+ "PARSER_REGISTRY",
32
+ ]
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Dict
5
+
6
+ from ..sources.base import RawRecord
7
+ from ..validation.base import ValidationResult
8
+
9
+
10
+ @dataclass(frozen=True)
11
+ class NormalizedRecord:
12
+ record_id: str
13
+ payload: Dict[str, str]
14
+
15
+
16
+ class Parser:
17
+ def parse(self, raw: RawRecord) -> NormalizedRecord:
18
+ raise NotImplementedError
19
+
20
+ def validate(self, record: RawRecord) -> ValidationResult:
21
+ raise NotImplementedError
22
+
23
+ def schema_id(self) -> str:
24
+ raise NotImplementedError
@@ -0,0 +1,171 @@
1
+ """Browser visits parser for ingestion layer (Sprint 3)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from dataclasses import dataclass
7
+ from typing import Any, Dict
8
+ from datetime import datetime, timezone
9
+
10
+ from ..sources.base import RawRecord
11
+ from ..validation.base import ValidationResult
12
+ from .base import NormalizedRecord, Parser
13
+
14
+ logger = logging.getLogger("topos.ingestion.parser.browser")
15
+
16
+
17
+ @dataclass
18
+ class BrowserParser(Parser):
19
+ """Parser for browser_visits records. Stores records as-is with minimal normalization."""
20
+
21
+ dataset_id: str
22
+ _schema_id: str = "browser.visits.v1"
23
+
24
+ def parse(self, raw: RawRecord) -> NormalizedRecord:
25
+ """Parse browser visit record. For MVP, we store records as-is."""
26
+ payload = raw.payload
27
+
28
+ # Normalize timestamp if present
29
+ visited_at = payload.get("visited_at") or payload.get("timestamp")
30
+ if isinstance(visited_at, str):
31
+ # Already ISO format, keep as-is
32
+ ts = visited_at
33
+ elif visited_at is None:
34
+ # Use current time if missing
35
+ ts = datetime.now(timezone.utc).isoformat()
36
+ else:
37
+ ts = str(visited_at)
38
+
39
+ # Create normalized record with all browser fields preserved
40
+ normalized = {
41
+ "record_id": payload.get("url", raw.record_id) + "_" + ts, # Unique ID
42
+ "dataset_id": self.dataset_id,
43
+ "url": payload.get("url", ""),
44
+ "visited_at": ts,
45
+ "title": payload.get("title", ""),
46
+ "favicon_url": payload.get("favicon_url"),
47
+ "hostname": payload.get("hostname", ""),
48
+ "device_name": payload.get("device_name", ""),
49
+ "tab_id": payload.get("tab_id"),
50
+ "window_id": payload.get("window_id"),
51
+ "incognito": payload.get("incognito"),
52
+ "transition_type": payload.get("transition_type", ""),
53
+ "pinned": payload.get("pinned"),
54
+ "audible": payload.get("audible"),
55
+ "muted": payload.get("muted"),
56
+ "opener_tab_id": payload.get("opener_tab_id"),
57
+ "referred_by": payload.get("referred_by"),
58
+ }
59
+
60
+ # Remove None values to keep payload clean
61
+ normalized = {k: v for k, v in normalized.items() if v is not None}
62
+
63
+ logger.debug(
64
+ "[PIPELINE:PARSER] Parsed browser visit: url=%s, visited_at=%s",
65
+ normalized.get("url", "")[:50],
66
+ normalized.get("visited_at", ""),
67
+ )
68
+ return NormalizedRecord(record_id=normalized["record_id"], payload=normalized)
69
+
70
+ def validate(self, record: RawRecord) -> ValidationResult:
71
+ """Validate browser visit record. For MVP, minimal validation."""
72
+ payload = record.payload
73
+ if not isinstance(payload, dict):
74
+ return ValidationResult(
75
+ is_valid=False,
76
+ errors=["Record must be a dict"],
77
+ metadata={},
78
+ )
79
+
80
+ # Required fields for browser_visits
81
+ if "url" not in payload:
82
+ return ValidationResult(
83
+ is_valid=False,
84
+ errors=["Missing required field: url"],
85
+ metadata={},
86
+ )
87
+
88
+ if "visited_at" not in payload and "timestamp" not in payload:
89
+ return ValidationResult(
90
+ is_valid=False,
91
+ errors=["Missing required field: visited_at (or timestamp)"],
92
+ metadata={},
93
+ )
94
+
95
+ return ValidationResult(is_valid=True, errors=[], metadata={})
96
+
97
+ def schema_id(self) -> str:
98
+ return self._schema_id
99
+
100
+
101
+ @dataclass
102
+ class BrowserEventsParser(Parser):
103
+ """Parser for browser_events: clicks, highlights, star_page, VIDEO_PLAY. Stores event_type + payload."""
104
+
105
+ dataset_id: str
106
+ _schema_id: str = "browser.events.v1"
107
+
108
+ def parse(self, raw: RawRecord) -> NormalizedRecord:
109
+ """Parse browser event record. Preserves event_type and full payload."""
110
+ payload = raw.payload
111
+ event_type = payload.get("event_type") or "unknown"
112
+ ts = (
113
+ payload.get("visited_at")
114
+ or payload.get("starred_at")
115
+ or payload.get("created_at")
116
+ or datetime.now(timezone.utc).isoformat()
117
+ )
118
+ if isinstance(ts, (int, float)):
119
+ ts = datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
120
+ else:
121
+ ts = str(ts)
122
+ url = payload.get("url") or ""
123
+ record_id = f"{event_type}_{(url or raw.record_id)[:80]}_{ts[:24]}"
124
+ normalized = {
125
+ "record_id": record_id,
126
+ "dataset_id": self.dataset_id,
127
+ "event_type": event_type,
128
+ "url": url,
129
+ "visited_at": ts,
130
+ "title": payload.get("title"),
131
+ "favicon_url": payload.get("favicon_url"),
132
+ "hostname": payload.get("hostname"),
133
+ "device_name": payload.get("device_name"),
134
+ "transition_type": payload.get("transition_type"),
135
+ "content": payload.get("content"),
136
+ "tab_id": payload.get("tab_id"),
137
+ "window_id": payload.get("window_id"),
138
+ "incognito": payload.get("incognito"),
139
+ "pinned": payload.get("pinned"),
140
+ "audible": payload.get("audible"),
141
+ "muted": payload.get("muted"),
142
+ "opener_tab_id": payload.get("opener_tab_id"),
143
+ "starred_at": payload.get("starred_at"),
144
+ }
145
+ normalized = {k: v for k, v in normalized.items() if v is not None}
146
+ logger.debug(
147
+ "[PIPELINE:PARSER] Parsed browser event: event_type=%s, url=%s",
148
+ event_type,
149
+ url[:50] if url else None,
150
+ )
151
+ return NormalizedRecord(record_id=record_id, payload=normalized)
152
+
153
+ def validate(self, record: RawRecord) -> ValidationResult:
154
+ """Validate browser event: require event_type and at least url or content."""
155
+ payload = record.payload
156
+ if not isinstance(payload, dict):
157
+ return ValidationResult(
158
+ is_valid=False,
159
+ errors=["Record must be a dict"],
160
+ metadata={},
161
+ )
162
+ if not payload.get("event_type"):
163
+ return ValidationResult(
164
+ is_valid=False,
165
+ errors=["Missing required field: event_type"],
166
+ metadata={},
167
+ )
168
+ return ValidationResult(is_valid=True, errors=[], metadata={})
169
+
170
+ def schema_id(self) -> str:
171
+ return self._schema_id
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ from ..sources.base import RawRecord
6
+ from ..validation.base import ValidationResult
7
+ from .base import NormalizedRecord, Parser
8
+
9
+
10
+ @dataclass
11
+ class CalendarParser(Parser):
12
+ dataset_id: str
13
+
14
+ def parse(self, raw: RawRecord) -> NormalizedRecord:
15
+ return NormalizedRecord(record_id=raw.record_id, payload=raw.payload)
16
+
17
+ def validate(self, record: RawRecord) -> ValidationResult:
18
+ return ValidationResult(is_valid=True, errors=[], metadata={})
19
+
20
+ def schema_id(self) -> str:
21
+ return "calendar.events.v1"
@@ -0,0 +1,266 @@
1
+ """ChatGPT conversation flattener.
2
+
3
+ Converts nested conversation objects from ChatGPT export format into flat message records.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from typing import Any, Dict, Iterator, List, Optional
10
+
11
+ logger = logging.getLogger("topos.ingestion.parser.chatgpt_flattener")
12
+
13
+
14
+ def flatten_conversation(conversation: Dict[str, Any], include_system: bool = False) -> Iterator[Dict[str, Any]]:
15
+ """Flatten a ChatGPT conversation object into individual message records.
16
+
17
+ Args:
18
+ conversation: ChatGPT conversation object with 'mapping' field
19
+ include_system: Whether to include system messages (default: False)
20
+
21
+ Yields:
22
+ Flattened message records compatible with chatgpt.conversation.v1 format
23
+ """
24
+ mapping = conversation.get("mapping", {})
25
+ if not mapping:
26
+ logger.warning("Conversation has no mapping field")
27
+ return # Generator function - return without yielding means empty
28
+
29
+ conv_id = conversation.get("conversation_id") or conversation.get("id", "")
30
+ conv_title = conversation.get("title")
31
+ conv_create_time = conversation.get("create_time")
32
+
33
+ # Traverse the message tree and extract messages
34
+ visited = set()
35
+
36
+ def traverse_node(node_id: str) -> Iterator[Dict[str, Any]]:
37
+ """Recursively traverse message nodes and yield records."""
38
+ if node_id in visited or node_id not in mapping:
39
+ return
40
+
41
+ visited.add(node_id)
42
+ node = mapping[node_id]
43
+ message = node.get("message")
44
+
45
+ # Skip nodes without messages (root nodes, etc.)
46
+ if not message:
47
+ # Still traverse children
48
+ children = node.get("children")
49
+ if children:
50
+ for child_id in children:
51
+ yield from traverse_node(child_id)
52
+ return
53
+
54
+ # Extract message data
55
+ role = message.get("author", {}).get("role", "").lower()
56
+
57
+ # Skip system messages unless explicitly included
58
+ if role == "system" and not include_system:
59
+ # Still traverse children
60
+ children = node.get("children")
61
+ if children:
62
+ for child_id in children:
63
+ yield from traverse_node(child_id)
64
+ return
65
+
66
+ # Extract content
67
+ content_obj = message.get("content", {})
68
+ content_type = content_obj.get("content_type", "text")
69
+ parts = content_obj.get("parts", [])
70
+
71
+ # Handle different content types
72
+ content = extract_content(content_obj, content_type)
73
+
74
+ # Skip messages with empty content (unless they're tool calls)
75
+ if not content and content_type == "text":
76
+ # Still traverse children (might be tool execution results)
77
+ children = node.get("children")
78
+ if children:
79
+ for child_id in children:
80
+ yield from traverse_node(child_id)
81
+ return
82
+
83
+ # Extract timestamp
84
+ create_time = message.get("create_time")
85
+ if create_time is None:
86
+ create_time = conv_create_time
87
+
88
+ # Map role to expected format
89
+ # ChatGPT uses: user, assistant, system, tool
90
+ # We need: user -> "user", assistant -> "assistant", tool -> "assistant"
91
+ mapped_role = role
92
+ if role == "tool":
93
+ mapped_role = "assistant" # Tool messages are from assistant
94
+
95
+ # Create flattened record
96
+ record = {
97
+ "id": message.get("id", node_id),
98
+ "thread_id": conv_id,
99
+ "role": mapped_role,
100
+ "content": content,
101
+ "created_at": create_time,
102
+ # Additional metadata (optional, for debugging)
103
+ "_metadata": {
104
+ "conversation_title": conv_title,
105
+ "node_id": node_id,
106
+ "parent_id": node.get("parent"),
107
+ "content_type": content_type,
108
+ "original_role": role,
109
+ },
110
+ }
111
+
112
+ yield record
113
+
114
+ # Traverse children
115
+ children = node.get("children")
116
+ if children:
117
+ for child_id in children:
118
+ yield from traverse_node(child_id)
119
+
120
+ # Find root nodes (nodes with no parent or parent not in mapping)
121
+ root_nodes = []
122
+ for node_id, node in mapping.items():
123
+ parent = node.get("parent")
124
+ if not parent or parent not in mapping:
125
+ root_nodes.append(node_id)
126
+
127
+ # Start traversal from root nodes
128
+ for root_id in root_nodes:
129
+ try:
130
+ yield from traverse_node(root_id)
131
+ except Exception as e:
132
+ logger.warning(f"Error traversing root node {root_id}: {e}", exc_info=True)
133
+ continue
134
+
135
+
136
+ def extract_content(content_obj: Dict[str, Any], content_type: str) -> str:
137
+ """Extract text content from content object based on content type.
138
+
139
+ Args:
140
+ content_obj: Content object from message
141
+ content_type: Type of content (text, thoughts, reasoning_recap, etc.)
142
+
143
+ Returns:
144
+ Extracted text content
145
+ """
146
+ if content_type == "text":
147
+ parts = content_obj.get("parts", [])
148
+ if isinstance(parts, list):
149
+ # Join parts, filtering out empty strings
150
+ return " ".join(str(p) for p in parts if p and str(p).strip())
151
+ return str(parts) if parts else ""
152
+
153
+ elif content_type == "thoughts":
154
+ # Extract thoughts content
155
+ thoughts = content_obj.get("thoughts", [])
156
+ if isinstance(thoughts, list):
157
+ # Extract summary or content from each thought
158
+ thought_texts = []
159
+ for thought in thoughts:
160
+ if isinstance(thought, dict):
161
+ summary = thought.get("summary", "")
162
+ content = thought.get("content", "")
163
+ if summary:
164
+ thought_texts.append(summary)
165
+ elif content:
166
+ thought_texts.append(content)
167
+ elif isinstance(thought, str):
168
+ thought_texts.append(thought)
169
+ return " ".join(thought_texts)
170
+ return str(thoughts) if thoughts else ""
171
+
172
+ elif content_type == "reasoning_recap":
173
+ # Extract reasoning recap content
174
+ recap = content_obj.get("reasoning_recap", "")
175
+ if recap:
176
+ return str(recap)
177
+ # Fallback to parts if available
178
+ parts = content_obj.get("parts", [])
179
+ if isinstance(parts, list):
180
+ return " ".join(str(p) for p in parts if p)
181
+ return ""
182
+
183
+ elif content_type == "code":
184
+ # Extract code content
185
+ code = content_obj.get("code", "")
186
+ if code:
187
+ return f"```\n{code}\n```"
188
+ # Fallback to parts
189
+ parts = content_obj.get("parts", [])
190
+ if isinstance(parts, list):
191
+ return " ".join(str(p) for p in parts if p)
192
+ return ""
193
+
194
+ elif content_type == "multimodal_text":
195
+ # Extract multimodal text (may have images, etc.)
196
+ parts = content_obj.get("parts", [])
197
+ if isinstance(parts, list):
198
+ # Filter out non-text parts
199
+ text_parts = [str(p) for p in parts if isinstance(p, str) and p.strip()]
200
+ return " ".join(text_parts)
201
+ return ""
202
+
203
+ elif content_type == "execution_output":
204
+ # Extract execution output
205
+ output = content_obj.get("output", "")
206
+ if output:
207
+ return str(output)
208
+ parts = content_obj.get("parts", [])
209
+ if isinstance(parts, list):
210
+ return " ".join(str(p) for p in parts if p)
211
+ return ""
212
+
213
+ else:
214
+ # Unknown content type - try to extract parts
215
+ logger.warning(f"Unknown content type: {content_type}")
216
+ parts = content_obj.get("parts", [])
217
+ if isinstance(parts, list):
218
+ return " ".join(str(p) for p in parts if p)
219
+ return ""
220
+
221
+
222
+ def is_conversation_format(record: Dict[str, Any]) -> bool:
223
+ """Check if a record is a ChatGPT conversation object.
224
+
225
+ Args:
226
+ record: Record to check
227
+
228
+ Returns:
229
+ True if record appears to be a conversation object
230
+ """
231
+ return (
232
+ isinstance(record, dict) and
233
+ "mapping" in record and
234
+ isinstance(record.get("mapping"), dict) and
235
+ ("conversation_id" in record or "id" in record)
236
+ )
237
+
238
+
239
+ def flatten_conversation_array(conversations: List[Dict[str, Any]], include_system: bool = False) -> Iterator[Dict[str, Any]]:
240
+ """Flatten an array of conversation objects.
241
+
242
+ Args:
243
+ conversations: List of conversation objects
244
+ include_system: Whether to include system messages
245
+
246
+ Yields:
247
+ Flattened message records
248
+ """
249
+ if not conversations:
250
+ return
251
+
252
+ for conv in conversations:
253
+ if not conv or not isinstance(conv, dict):
254
+ logger.warning(f"Skipping invalid conversation object: {type(conv)}")
255
+ continue
256
+
257
+ if not is_conversation_format(conv):
258
+ logger.warning(f"Skipping non-conversation object: {type(conv)}")
259
+ continue
260
+
261
+ try:
262
+ for record in flatten_conversation(conv, include_system=include_system):
263
+ yield record
264
+ except Exception as e:
265
+ logger.error(f"Error flattening conversation {conv.get('conversation_id', conv.get('id', 'unknown'))}: {e}", exc_info=True)
266
+ continue