topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
from dataclasses import dataclass, fields
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
8
|
+
|
|
9
|
+
from ..canonicalization.mappers import MAPPER_REGISTRY
|
|
10
|
+
from ..canonicalization.mappers.base import CanonicalMapper as LegacyCanonicalMapper
|
|
11
|
+
from ..canonicalization.mappers.base import CanonicalRecord, MappingMetadata
|
|
12
|
+
from ..canonicalization.models import CanonicalMessage
|
|
13
|
+
from ..ingestion.parsers import PARSER_REGISTRY
|
|
14
|
+
from ..ingestion.parsers.base import NormalizedRecord, Parser
|
|
15
|
+
from ..ingestion.sources.base import RawRecord
|
|
16
|
+
from ..ingestion.validation.base import ValidationResult
|
|
17
|
+
from ..sources.definitions import DataSourceDefinition
|
|
18
|
+
from ..sources.registry import REGISTRY
|
|
19
|
+
from ..storage.canonical.ai_chat.mapper import CanonicalMapper as StorageCanonicalMapper
|
|
20
|
+
from ..storage.canonical.ai_chat.mapper import register_mapper
|
|
21
|
+
from ..storage.canonical.ai_chat.model import CanonicalAIChatMessage
|
|
22
|
+
from ..storage.canonical.ai_chat import mapper as storage_mapper_module
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _maybe_parse_json(value: Any) -> Any:
|
|
26
|
+
if isinstance(value, str):
|
|
27
|
+
stripped = value.strip()
|
|
28
|
+
if stripped.startswith("{") or stripped.startswith("["):
|
|
29
|
+
try:
|
|
30
|
+
return json.loads(stripped)
|
|
31
|
+
except json.JSONDecodeError:
|
|
32
|
+
return value
|
|
33
|
+
return value
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _parse_source_definition_from_version_row(version_row: Dict[str, Any]) -> Dict[str, Any]:
    """Build a source-definition payload from a stored version row.

    The ``source_definition_json`` column (a dict or a JSON string) is the
    base. Keys missing from it are backfilled from ``compatibility_json``
    (``parser_id``, ``canonical_mapper_id``, ``source_type``), and empty
    ``schema_id`` / ``source_id`` values are filled from the row itself.

    Args:
        version_row: Row mapping holding the JSON columns and optional
            ``schema_id`` / ``source_id`` values.

    Returns:
        A new dict; the row's own ``source_definition_json`` object is not
        modified.

    Raises:
        ValueError: If ``source_definition_json`` is not a JSON object.
    """
    parsed = _maybe_parse_json(version_row.get("source_definition_json"))
    if not isinstance(parsed, dict):
        raise ValueError("source_definition_json must be a JSON object in the version row")

    # When the column already holds a dict, _maybe_parse_json returns that very
    # object. Copy it so the backfill below never mutates the caller's row.
    source_def = dict(parsed)

    compatibility = _maybe_parse_json(version_row.get("compatibility_json"))
    if isinstance(compatibility, dict):
        for key in ("parser_id", "canonical_mapper_id", "source_type"):
            source_def.setdefault(key, compatibility.get(key))

    for key in ("schema_id", "source_id"):
        row_value = version_row.get(key)
        if row_value and not source_def.get(key):
            source_def[key] = row_value
    return source_def
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _build_source_definition(payload: Dict[str, Any]) -> DataSourceDefinition:
    """Construct a ``DataSourceDefinition`` from ``payload``.

    Only keys that match a declared dataclass field are forwarded; every
    other key in ``payload`` is ignored.
    """
    known_fields = set()
    for spec in fields(DataSourceDefinition):
        known_fields.add(spec.name)

    kwargs: Dict[str, Any] = {}
    for key, value in payload.items():
        if key in known_fields:
            kwargs[key] = value
    return DataSourceDefinition(**kwargs)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _tokenize(path: str) -> List[str]:
|
|
61
|
+
return [part.strip() for part in str(path).split(".") if part.strip()]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _walk_step(nodes: List[Any], token: str) -> List[Any]:
|
|
65
|
+
results: List[Any] = []
|
|
66
|
+
if token == "*":
|
|
67
|
+
for node in nodes:
|
|
68
|
+
if isinstance(node, dict):
|
|
69
|
+
results.extend(node.values())
|
|
70
|
+
elif isinstance(node, list):
|
|
71
|
+
results.extend(node)
|
|
72
|
+
return results
|
|
73
|
+
|
|
74
|
+
list_mode = token.endswith("[*]")
|
|
75
|
+
key = token[:-3] if list_mode else token
|
|
76
|
+
for node in nodes:
|
|
77
|
+
if not isinstance(node, dict):
|
|
78
|
+
continue
|
|
79
|
+
if key not in node:
|
|
80
|
+
continue
|
|
81
|
+
value = node.get(key)
|
|
82
|
+
if list_mode:
|
|
83
|
+
if isinstance(value, list):
|
|
84
|
+
results.extend(value)
|
|
85
|
+
elif value is not None:
|
|
86
|
+
results.append(value)
|
|
87
|
+
else:
|
|
88
|
+
results.append(value)
|
|
89
|
+
return results
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _extract_path(payload: Dict[str, Any], path: str) -> Any:
|
|
93
|
+
if not path:
|
|
94
|
+
return None
|
|
95
|
+
nodes: List[Any] = [payload]
|
|
96
|
+
for token in _tokenize(path):
|
|
97
|
+
nodes = _walk_step(nodes, token)
|
|
98
|
+
if not nodes:
|
|
99
|
+
return None
|
|
100
|
+
if len(nodes) == 1:
|
|
101
|
+
return nodes[0]
|
|
102
|
+
return nodes
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _coerce_text(value: Any) -> str:
|
|
106
|
+
if value is None:
|
|
107
|
+
return ""
|
|
108
|
+
if isinstance(value, str):
|
|
109
|
+
return value
|
|
110
|
+
if isinstance(value, (int, float, bool)):
|
|
111
|
+
return str(value)
|
|
112
|
+
if isinstance(value, list):
|
|
113
|
+
parts = [_coerce_text(item) for item in value]
|
|
114
|
+
return "\n".join([part for part in parts if part])
|
|
115
|
+
if isinstance(value, dict):
|
|
116
|
+
return json.dumps(value, ensure_ascii=True)
|
|
117
|
+
return str(value)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _normalize_ts(value: Any) -> str:
|
|
121
|
+
if value is None:
|
|
122
|
+
return ""
|
|
123
|
+
if isinstance(value, (int, float)):
|
|
124
|
+
return datetime.fromtimestamp(float(value), tz=timezone.utc).isoformat()
|
|
125
|
+
return _coerce_text(value)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _derive_parser_extract_map_for_direct_table_passthrough(
|
|
129
|
+
source_def_payload: Dict[str, Any],
|
|
130
|
+
) -> Tuple[Dict[str, Any], bool]:
|
|
131
|
+
file_ingest_shape = source_def_payload.get("file_ingest_shape")
|
|
132
|
+
parser_extract_map: Dict[str, Any] = {}
|
|
133
|
+
if isinstance(file_ingest_shape, dict):
|
|
134
|
+
maybe_map = file_ingest_shape.get("parser_extract_map")
|
|
135
|
+
if isinstance(maybe_map, dict):
|
|
136
|
+
parser_extract_map = dict(maybe_map)
|
|
137
|
+
if parser_extract_map:
|
|
138
|
+
return parser_extract_map, False
|
|
139
|
+
|
|
140
|
+
source_type = str(source_def_payload.get("source_type") or "").strip()
|
|
141
|
+
include_data_table = bool(source_def_payload.get("pipeline_include_data_table"))
|
|
142
|
+
if source_type != "file" or not include_data_table:
|
|
143
|
+
return parser_extract_map, False
|
|
144
|
+
|
|
145
|
+
tables = source_def_payload.get("tables")
|
|
146
|
+
if not isinstance(tables, list) or not tables:
|
|
147
|
+
raise ValueError(
|
|
148
|
+
"pipeline_include_data_table=true requires tables when parser_extract_map is empty"
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
column_names: List[str] = []
|
|
152
|
+
seen: set[str] = set()
|
|
153
|
+
for table in tables:
|
|
154
|
+
if not isinstance(table, dict):
|
|
155
|
+
continue
|
|
156
|
+
columns = table.get("columns")
|
|
157
|
+
if not isinstance(columns, list):
|
|
158
|
+
continue
|
|
159
|
+
for column in columns:
|
|
160
|
+
if not isinstance(column, dict):
|
|
161
|
+
continue
|
|
162
|
+
col_name = str(column.get("name") or "").strip()
|
|
163
|
+
if not col_name or col_name in seen:
|
|
164
|
+
continue
|
|
165
|
+
seen.add(col_name)
|
|
166
|
+
column_names.append(col_name)
|
|
167
|
+
|
|
168
|
+
if not column_names:
|
|
169
|
+
raise ValueError(
|
|
170
|
+
"pipeline_include_data_table=true with empty parser_extract_map requires at least one valid table column name"
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
derived_map = {col_name: col_name for col_name in column_names}
|
|
174
|
+
# Common case: table uses record_id but source rows provide id.
|
|
175
|
+
if "record_id" in derived_map and "id" in seen:
|
|
176
|
+
derived_map["record_id"] = "id"
|
|
177
|
+
return derived_map, True
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _build_dynamic_parser_class(
    *,
    source_def: DataSourceDefinition,
    parser_extract_map: Dict[str, Any],
    parser_id: str,
    requires_canonical_contract: bool,
    direct_table_passthrough: bool,
):
    """Create a ``Parser`` subclass specialised for one runtime-installed source.

    The returned class closes over the arguments below; nothing is copied.

    Args:
        source_def: Definition whose ``source_id`` is reported in validation
            metadata and used to name the generated class.
        parser_extract_map: Mapping of output key -> dotted path evaluated
            with ``_extract_path`` against each raw payload. Non-string
            paths are ignored.
        parser_id: Default schema id for parser instances; also reported as
            ``dynamic_parser_id`` in validation metadata.
        requires_canonical_contract: When false, ``parse`` returns the
            extracted fields as-is; when true, it builds the message-shaped
            payload (message_id, thread_id, ts, sender_type, content, ...).
        direct_table_passthrough: Enables the "nothing mapped" check and the
            ``record_id`` backfill in the non-canonical path.

    Returns:
        The generated parser class (not an instance).
    """

    class RuntimeInstalledParser(Parser):
        def __init__(self, dataset_id: str, _schema_id: str = parser_id):
            self.dataset_id = dataset_id
            self._schema_id = _schema_id

        def parse(self, raw: RawRecord) -> NormalizedRecord:
            """Extract configured fields from ``raw`` and normalise them.

            Raises:
                ValueError: In direct-table passthrough mode when the payload
                    is a non-empty dict yet every extracted value is ``None``.
            """
            payload = raw.payload
            extracted: Dict[str, Any] = {}
            for target_key, path in parser_extract_map.items():
                if isinstance(path, str):
                    extracted[target_key] = _extract_path(payload, path)

            if not requires_canonical_contract:
                # Passthrough sanity check: a non-empty payload from which no
                # mapped column could be read indicates a wrong extract map.
                if (
                    direct_table_passthrough
                    and extracted
                    and isinstance(payload, dict)
                    and payload
                    and all(value is None for value in extracted.values())
                ):
                    keys_preview = sorted([str(key) for key in payload.keys()])[:10]
                    raise ValueError(
                        "Direct table passthrough could not map any table columns from payload keys "
                        f"{keys_preview}. Define file_ingest_shape.parser_extract_map explicitly."
                    )
                # Backfill record_id from the raw record when the payload gave none.
                if direct_table_passthrough and extracted.get("record_id") is None:
                    extracted["record_id"] = raw.record_id
                record_id = _coerce_text(
                    extracted.get("record_id")
                    or extracted.get("id")
                    or raw.record_id
                )
                return NormalizedRecord(record_id=record_id, payload=extracted)

            # Canonical path: each field falls back from extracted values to
            # well-known payload keys. The `or` chains skip falsy values.
            # NOTE(review): payload.get(...) below assumes payload is a dict;
            # only validate() checks that - confirm callers validate first.
            message_id = _coerce_text(extracted.get("message_id") or payload.get("id") or raw.record_id)
            conversation_id = _coerce_text(
                extracted.get("conversation_id")
                or extracted.get("thread_id")
                or payload.get("thread_id")
                or payload.get("conversation_id")
                or ""
            )
            sender_hint = _coerce_text(extracted.get("sender_type") or extracted.get("role") or payload.get("role")).lower()
            # "user"/"human" become "human"; other non-empty hints pass through
            # lowercased; an empty hint defaults to "assistant".
            sender_type = "human" if sender_hint in {"user", "human"} else (sender_hint or "assistant")
            content = _coerce_text(extracted.get("content") or extracted.get("content_rendered") or payload.get("content"))
            ts = _normalize_ts(extracted.get("event_at") or extracted.get("created_at") or payload.get("created_at") or payload.get("ts"))

            normalized_payload: Dict[str, Any] = {
                "message_id": message_id,
                "dataset_id": self.dataset_id,
                "thread_id": conversation_id,
                "ts": ts,
                "sender_type": sender_type,
                "content": content,
            }
            metadata = extracted.get("metadata_json") or payload.get("metadata")
            if metadata is not None:
                # Non-dict metadata is wrapped so "_metadata" is always a dict.
                normalized_payload["_metadata"] = metadata if isinstance(metadata, dict) else {"metadata": _coerce_text(metadata)}
            return NormalizedRecord(record_id=message_id, payload=normalized_payload)

        def validate(self, record: RawRecord) -> ValidationResult:
            """Accept any record whose payload is a dict; reject everything else."""
            if not isinstance(record.payload, dict):
                return ValidationResult(is_valid=False, errors=["Payload must be an object"], metadata={})
            return ValidationResult(
                is_valid=True,
                errors=[],
                metadata={"dynamic_parser_id": parser_id, "source_id": source_def.source_id},
            )

        def schema_id(self) -> str:
            """Return the schema id this parser instance was created with."""
            return self._schema_id

    # Give each generated class a source-specific name (hyphens are not valid
    # in identifiers, so they are replaced with underscores).
    RuntimeInstalledParser.__name__ = f"RuntimeInstalledParser_{source_def.source_id.replace('-', '_')}"
    return RuntimeInstalledParser
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _build_dynamic_legacy_mapper_class(source_def: DataSourceDefinition, mapper_id: str):
    """Create a legacy canonical-mapper class bound to ``source_def``.

    The generated class turns a normalized record into a ``CanonicalMessage``
    wrapped in a ``CanonicalRecord``, stamping the source id, a SHA-256 hash
    of the content, and mapper identification metadata.
    """

    class RuntimeInstalledLegacyMapper(LegacyCanonicalMapper):
        version: str = "dynamic.v1"

        def map(self, normalized: NormalizedRecord) -> CanonicalRecord:
            """Map one normalized record to a canonical record."""
            data = normalized.payload
            text = _coerce_text(data.get("content"))
            digest = hashlib.sha256(text.encode("utf-8")).hexdigest()

            raw_conversation = data.get("thread_id") or data.get("conversation_id") or data.get("dataset_id")
            conversation_id = _coerce_text(raw_conversation)
            # Namespace bare ids with the source id; ids that already contain
            # a ":" are left untouched.
            needs_prefix = bool(conversation_id) and ":" not in conversation_id
            if needs_prefix:
                conversation_id = f"{source_def.source_id}:{conversation_id}"

            extra: Dict[str, Any] = {"mapper_version": self.version, "mapper_id": mapper_id}
            if "_metadata" in data:
                extra["_metadata"] = data.get("_metadata")

            message = CanonicalMessage(
                message_id=_coerce_text(data.get("message_id") or normalized.record_id),
                conversation_id=conversation_id,
                sender_type=_coerce_text(data.get("sender_type")),
                content=text,
                ts=_coerce_text(data.get("ts")) or None,
                source_id=source_def.source_id,
                content_hash=digest,
                metadata=extra,
            )
            return CanonicalRecord(record_id=message.message_id, payload=message.__dict__)

        def mapping_metadata(self, normalized: NormalizedRecord) -> MappingMetadata:
            """Report which source and mapping version produced a record."""
            return MappingMetadata(source_id=source_def.source_id, mapping_version=self.version)

    class_suffix = source_def.source_id.replace("-", "_")
    RuntimeInstalledLegacyMapper.__name__ = f"RuntimeInstalledLegacyMapper_{class_suffix}"
    return RuntimeInstalledLegacyMapper
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _build_dynamic_storage_mapper_class(source_def: DataSourceDefinition, mapper_id: str):
    """Create a storage-layer canonical mapper class bound to ``source_def``.

    The generated class converts one staging record into a single
    ``CanonicalAIChatMessage`` (returned in a one-element list).
    """

    class RuntimeInstalledStorageMapper(StorageCanonicalMapper):
        def map_to_canonical(self, staging_record: Dict[str, Any], source: str) -> List[CanonicalAIChatMessage]:
            """Map ``staging_record`` to a one-element list of canonical messages."""

            def text_of(key: str) -> str:
                return _coerce_text(staging_record.get(key))

            conversation_id = self.extract_conversation_id(staging_record)
            # A source_id on the record wins over the installed definition's id.
            resolved_source_id = _coerce_text(staging_record.get("source_id") or source_def.source_id)
            provenance = {
                "original_source": source,
                "mapper_id": mapper_id,
                "thread_id": staging_record.get("thread_id"),
                "_metadata": staging_record.get("_metadata"),
            }
            canonical = CanonicalAIChatMessage(
                message_id=text_of("message_id"),
                conversation_id=conversation_id,
                sender_type=_coerce_text(staging_record.get("sender_type") or "assistant"),
                sender_id=text_of("sender_id") or None,
                ts=text_of("ts"),
                content=text_of("content"),
                source_id=resolved_source_id,
                content_rendered=text_of("content_rendered") or None,
                metadata_json=provenance,
            )
            return [canonical]

        def extract_conversation_id(self, staging_record: Dict[str, Any]) -> str:
            """Return a source-namespaced conversation id for ``staging_record``.

            Uses ``thread_id`` then ``conversation_id``; ids already containing
            ":" are returned unchanged, and a record with neither falls back
            to ``<source_id>:<dataset_id>``.
            """
            conv = _coerce_text(staging_record.get("thread_id") or staging_record.get("conversation_id") or "")
            if not conv:
                return f"{source_def.source_id}:{_coerce_text(staging_record.get('dataset_id'))}"
            if ":" in conv:
                return conv
            return f"{source_def.source_id}:{conv}"

    class_suffix = source_def.source_id.replace("-", "_")
    RuntimeInstalledStorageMapper.__name__ = f"RuntimeInstalledStorageMapper_{class_suffix}"
    return RuntimeInstalledStorageMapper
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
@dataclass
class RuntimeInstallHandle:
    """Record of one runtime source installation, able to undo it.

    Holds the ids that were registered plus whatever each registry contained
    under those ids beforehand, so ``uninstall`` can put the registries back.
    """

    source_id: str
    parser_ids: List[str]
    canonical_mapper_id: Optional[str]
    # Prior registry entries; None means "nothing was registered before".
    _previous_source: Optional[DataSourceDefinition]
    _previous_parsers: Dict[str, Any]
    _previous_legacy_mapper: Optional[Any]
    _previous_storage_mapper: Optional[Any]

    def uninstall(self) -> None:
        """Restore the source, parser and mapper registries to their prior entries."""

        def restore(registry: Any, key: str, previous: Any) -> None:
            # No prior entry -> remove ours; otherwise reinstate the old one.
            if previous is None:
                registry.pop(key, None)
            else:
                registry[key] = previous

        restore(REGISTRY, self.source_id, self._previous_source)

        for parser_id in self.parser_ids:
            restore(PARSER_REGISTRY, parser_id, self._previous_parsers.get(parser_id))

        if not self.canonical_mapper_id:
            return
        mapper_id = self.canonical_mapper_id
        restore(MAPPER_REGISTRY, mapper_id, self._previous_legacy_mapper)
        restore(storage_mapper_module._MAPPER_REGISTRY, mapper_id, self._previous_storage_mapper)
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def install_source_definition(source_def_payload: Dict[str, Any]) -> RuntimeInstallHandle:
    """Register a source definition, its parser and (optionally) its mappers.

    Everything that can fail on bad input - building the definition, deriving
    the extract map, generating the classes - runs before any registry is
    touched, so a rejected payload leaves the registries unchanged. If
    registration itself fails part-way, the registries are rolled back to
    their prior entries before the error propagates.

    Args:
        source_def_payload: Source definition as a plain dict.

    Returns:
        A handle whose ``uninstall()`` restores the previous registry state.

    Raises:
        ValueError: If direct-table passthrough is requested without usable
            table columns (raised by the extract-map derivation).
    """
    source_def = _build_source_definition(source_def_payload)
    source_id = source_def.source_id

    # --- Phase 1: validate and build; no global state is modified here. ---
    parser_extract_map, direct_table_passthrough = _derive_parser_extract_map_for_direct_table_passthrough(
        source_def_payload
    )

    canonical_mapper_id = str(source_def_payload.get("canonical_mapper_id") or "").strip()
    canonical_group_id = str(source_def_payload.get("canonical_group_id") or "").strip()
    canonical_mapping_connected = bool(source_def_payload.get("canonical_mapping_connected"))
    requires_canonical_contract = bool(canonical_mapping_connected or canonical_mapper_id or canonical_group_id)

    # The parser is registered under both the schema id and the parser id
    # (deduplicated, blanks dropped).
    parser_ids = sorted(
        {
            item
            for item in [str(source_def.schema_id or "").strip(), str(source_def.parser_id or "").strip()]
            if item
        }
    )
    parser_cls = _build_dynamic_parser_class(
        source_def=source_def,
        parser_extract_map=parser_extract_map,
        parser_id=str(source_def.schema_id or source_def.parser_id),
        requires_canonical_contract=requires_canonical_contract,
        direct_table_passthrough=direct_table_passthrough,
    )

    mapper_id = str(source_def.canonical_mapper_id or "").strip() or None
    legacy_mapper_cls = None
    storage_mapper_cls = None
    if mapper_id:
        legacy_mapper_cls = _build_dynamic_legacy_mapper_class(source_def, mapper_id)
        storage_mapper_cls = _build_dynamic_storage_mapper_class(source_def, mapper_id)

    # --- Phase 2: snapshot current entries so the install can be undone. ---
    previous_parsers: Dict[str, Any] = {
        parser_id: PARSER_REGISTRY.get(parser_id) for parser_id in parser_ids
    }
    previous_legacy_mapper = None
    previous_storage_mapper = None
    if mapper_id:
        previous_legacy_mapper = MAPPER_REGISTRY.get(mapper_id)
        previous_storage_mapper = storage_mapper_module._MAPPER_REGISTRY.get(mapper_id)

    handle = RuntimeInstallHandle(
        source_id=source_id,
        parser_ids=parser_ids,
        canonical_mapper_id=mapper_id,
        _previous_source=REGISTRY.get(source_id),
        _previous_parsers=previous_parsers,
        _previous_legacy_mapper=previous_legacy_mapper,
        _previous_storage_mapper=previous_storage_mapper,
    )

    # --- Phase 3: register; roll back on any failure. ---
    try:
        REGISTRY[source_id] = source_def
        for parser_id in parser_ids:
            PARSER_REGISTRY[parser_id] = parser_cls
        if mapper_id:
            MAPPER_REGISTRY[mapper_id] = legacy_mapper_cls
            register_mapper(mapper_id, storage_mapper_cls)
    except Exception:
        # uninstall() resets every touched key to its snapshot, which is
        # harmless for keys that had not been written yet.
        handle.uninstall()
        raise

    return handle
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def install_source_from_version_row(version_row: Dict[str, Any]) -> Tuple[RuntimeInstallHandle, Dict[str, Any]]:
    """Install the source described by a stored version row.

    Returns:
        A ``(handle, payload)`` pair: the uninstall handle and the source
        definition payload that was parsed from the row and installed.
    """
    payload = _parse_source_definition_from_version_row(version_row)
    return install_source_definition(payload), payload
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Storage layer for Topos."""
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Canonical storage abstractions."""
|
|
2
|
+
|
|
3
|
+
from .conversations_tables import (
|
|
4
|
+
ConversationsTablesManager,
|
|
5
|
+
ensure_all_tables,
|
|
6
|
+
ensure_conversation_messages_table,
|
|
7
|
+
ensure_conversations_table,
|
|
8
|
+
)
|
|
9
|
+
from .ai_chat import CanonicalTablesManager, Canonicalizer
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"ConversationsTablesManager",
|
|
13
|
+
"ensure_all_tables",
|
|
14
|
+
"ensure_conversation_messages_table",
|
|
15
|
+
"ensure_conversations_table",
|
|
16
|
+
"CanonicalTablesManager",
|
|
17
|
+
"Canonicalizer",
|
|
18
|
+
]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Canonical AI chat layer - unified data models for AI chat sources.
|
|
2
|
+
|
|
3
|
+
Migrated from engine/canonical/ (commit 7b709af).
|
|
4
|
+
Maps source-specific staging data (e.g. ChatGPT) into ai_chat_messages / ai_chat_conversations.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .model import CanonicalAIChatModel, CanonicalAIChatMessage, CanonicalAIChatConversation
|
|
8
|
+
from .mapper import CanonicalMapper, get_mapper, ChatGPTToAIChatMapper, StoreMessageToAIChatMapper
|
|
9
|
+
from .tables import CanonicalTablesManager
|
|
10
|
+
from .canonicalizer import Canonicalizer
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"CanonicalAIChatModel",
|
|
14
|
+
"CanonicalAIChatMessage",
|
|
15
|
+
"CanonicalAIChatConversation",
|
|
16
|
+
"CanonicalMapper",
|
|
17
|
+
"get_mapper",
|
|
18
|
+
"ChatGPTToAIChatMapper",
|
|
19
|
+
"StoreMessageToAIChatMapper",
|
|
20
|
+
"CanonicalTablesManager",
|
|
21
|
+
"Canonicalizer",
|
|
22
|
+
]
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Canonicalizer - orchestrates canonicalization of staging data.
|
|
2
|
+
|
|
3
|
+
Migrated from engine/canonical/canonicalizer.py (commit 7b709af).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from .mapper import get_mapper
|
|
13
|
+
from .model import CanonicalAIChatConversation
|
|
14
|
+
from .tables import CanonicalTablesManager
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("topos.storage.canonical.ai_chat.canonicalizer")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Canonicalizer:
    """Orchestrates canonicalization of staging data to canonical models."""

    def __init__(self, tables_manager: "CanonicalTablesManager"):
        """Initialize with canonical tables manager."""
        self.tables_manager = tables_manager

    @staticmethod
    def _build_result(
        conversations_created: int = 0,
        messages_created: int = 0,
        canonical_messages: Optional[List[Dict[str, Any]]] = None,
        errors: Optional[List[Dict[str, Any]]] = None,
    ) -> Dict[str, Any]:
        """Build the result dict returned by ``canonicalize_staging_batch``."""
        return {
            "conversations_created": conversations_created,
            "messages_created": messages_created,
            "canonical_messages": [] if canonical_messages is None else canonical_messages,
            "errors": [] if errors is None else errors,
        }

    @staticmethod
    def _owner_from_record(record: Dict[str, Any]) -> str:
        """Return the text before the first ":" of the record's ``dataset_id``.

        Returns an empty string when ``dataset_id`` is missing, ``None``,
        not a string, or contains no ":" separator.
        """
        dataset_id = record.get("dataset_id")
        if not isinstance(dataset_id, str):
            # A null / non-string dataset_id carries no owner prefix; it must
            # not make an otherwise valid record fail.
            return ""
        owner, separator, _ = dataset_id.partition(":")
        return owner if separator else ""

    def canonicalize_staging_batch(
        self,
        staging_records: List[Dict[str, Any]],
        source: str,
        batch_size: int = 1000,
    ) -> Dict[str, Any]:
        """Canonicalize a batch of staging records.

        Each record is mapped with the mapper registered for ``source``; the
        resulting messages are grouped into conversations, both are written
        through the tables manager, and message sequences are refreshed for
        every conversation touched. Failures while mapping a record or while
        writing are logged and collected in ``errors`` instead of raised.

        Args:
            staging_records: List of records from staging table
            source: Source identifier (e.g., "chatgpt")
            batch_size: Batch size for writing canonical records

        Returns:
            Dict with canonicalization results:
            {
                "conversations_created": int,
                "messages_created": int,
                "canonical_messages": List[Dict],
                "errors": List[Dict]
            }
        """
        if not staging_records:
            return self._build_result()

        try:
            mapper = get_mapper(source)
        except ValueError as exc:
            logger.error("No mapper found for source %s: %s", source, exc)
            return self._build_result(errors=[{"error": str(exc), "source": source}])

        canonical_messages: List[Any] = []
        conversation_owners: Dict[str, str] = {}
        errors: List[Dict[str, Any]] = []

        for record in staging_records:
            try:
                # Materialise the mapper output so it can be traversed twice
                # even if the mapper hands back a one-shot iterator.
                messages = list(mapper.map_to_canonical(record, source))
                owner_user_id = self._owner_from_record(record)
                # First record to mention a conversation decides its owner.
                record_owners = {
                    msg.conversation_id: owner_user_id for msg in messages
                }
            except Exception as exc:
                logger.error("Failed to map staging record to canonical: %s", exc)
                errors.append({
                    "record": record,
                    "error": str(exc),
                    "source": source,
                })
                continue

            # Only reached when the whole record was processed, so a record
            # reported in ``errors`` never contributes messages to the batch.
            for conversation_id, owner in record_owners.items():
                conversation_owners.setdefault(conversation_id, owner)
            canonical_messages.extend(messages)

        if not canonical_messages:
            return self._build_result(errors=errors)

        conversations_dict: Dict[str, List[Any]] = defaultdict(list)
        for msg in canonical_messages:
            conversations_dict[msg.conversation_id].append(msg)

        conversations: List[CanonicalAIChatConversation] = []
        for conversation_id, messages in conversations_dict.items():
            # Conversation bounds come from the non-empty message timestamps.
            timestamps = [msg.ts for msg in messages if msg.ts]
            conversations.append(
                CanonicalAIChatConversation(
                    conversation_id=conversation_id,
                    owner_user_id=conversation_owners.get(conversation_id, ""),
                    title=None,
                    source=source,
                    created_at=min(timestamps) if timestamps else "",
                    updated_at=max(timestamps) if timestamps else "",
                )
            )

        conversations_created = 0
        try:
            conversations_created = self.tables_manager.write_conversations_batch(
                conversations, batch_size=batch_size
            )
        except Exception as exc:
            logger.error("Failed to write conversations: %s", exc)
            errors.append({"error": f"Failed to write conversations: {exc}", "source": source})

        messages_created = 0
        try:
            messages_created = self.tables_manager.write_messages_batch(
                canonical_messages, batch_size=batch_size
            )
        except Exception as exc:
            logger.error("Failed to write messages: %s", exc)
            errors.append({"error": f"Failed to write messages: {exc}", "source": source})

        # Best effort: a sequencing failure is logged but not reported in
        # ``errors``.
        for conversation_id in conversations_dict:
            try:
                self.tables_manager.update_message_sequences(conversation_id)
            except Exception as exc:
                logger.warning("Failed to update sequences for conversation %s: %s", conversation_id, exc)

        return self._build_result(
            conversations_created=conversations_created,
            messages_created=messages_created,
            canonical_messages=[msg.to_dict() for msg in canonical_messages],
            errors=errors,
        )
|