topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Canonical mapper - maps staging data to canonical models.
|
|
2
|
+
|
|
3
|
+
Migrated from engine/canonical/mapper.py (commit 7b709af).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
from typing import Any, Dict, List
|
|
11
|
+
|
|
12
|
+
from .model import CanonicalAIChatMessage, CanonicalAIChatConversation
|
|
13
|
+
|
|
14
|
+
# Module-level logger, named with the explicit dotted path of this mapper module.
logger = logging.getLogger("topos.storage.canonical.ai_chat.mapper")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CanonicalMapper(ABC):
    """Abstract interface for turning staging records into canonical AI chat messages.

    Both methods are abstract, so this class cannot be instantiated directly;
    concrete mappers must implement each of them.
    """

    @abstractmethod
    def map_to_canonical(
        self,
        staging_record: Dict[str, Any],
        source: str
    ) -> List[CanonicalAIChatMessage]:
        """Convert one staging record into canonical messages.

        Args:
            staging_record: A single record read from a staging table.
            source: Identifier of the originating source (e.g., "chatgpt").

        Returns:
            The canonical messages derived from the record -- usually one,
            although a mapper may produce several.
        """

    @abstractmethod
    def extract_conversation_id(self, staging_record: Dict[str, Any]) -> str:
        """Derive the conversation identifier for a staging record.

        Args:
            staging_record: A single record read from a staging table.

        Returns:
            The conversation ID, unified across sources.
        """
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ChatGPTToAIChatMapper(CanonicalMapper):
    """Maps ChatGPT staging records to the canonical AI chat model."""

    def map_to_canonical(
        self,
        staging_record: Dict[str, Any],
        source: str = "chatgpt"
    ) -> List[CanonicalAIChatMessage]:
        """Map a ChatGPT staging record to a canonical AI chat message.

        ChatGPT staging format:
        {
            "message_id": "...",
            "dataset_id": "...",
            "thread_id": "...",
            "ts": "...",
            "sender_type": "human" | "assistant",
            "content": "..."
        }

        Two optional keys are also read: "source_id" (overrides ``source`` as
        the message's source_id when truthy) and "_metadata" (merged into the
        message metadata).

        Args:
            staging_record: Record from the staging table.
            source: Mapper/source identifier; also recorded in the metadata
                under "original_source".

        Returns:
            List with a single canonical message.
        """
        conversation_id = self.extract_conversation_id(staging_record)

        # Use source_id from staging_record if available (actual source_id like "chatgpt_file_ingestion"),
        # otherwise fall back to source parameter (mapper ID like "chatgpt")
        actual_source_id = staging_record.get("source_id") or source

        # Preserve _metadata if present (for conversation tree reconstruction)
        metadata = {
            "thread_id": staging_record.get("thread_id"),
            "original_source": source,
        }
        # A record may carry "_metadata": None (or an empty mapping); calling
        # dict.update(None) would raise TypeError, so only merge real content.
        extra_metadata = staging_record.get("_metadata")
        if extra_metadata:
            metadata.update(extra_metadata)

        message = CanonicalAIChatMessage(
            message_id=staging_record.get("message_id", ""),
            conversation_id=conversation_id,
            sender_type=staging_record.get("sender_type", ""),
            sender_id=None,
            ts=staging_record.get("ts", ""),
            # content may be present but None; normalise to an empty string.
            content=staging_record.get("content") or "",
            content_rendered=None,
            metadata_json=metadata,
            seq=0,
            source_id=actual_source_id,
        )
        return [message]

    def extract_conversation_id(self, staging_record: Dict[str, Any]) -> str:
        """Extract conversation ID from ChatGPT staging record.

        Returns "chatgpt:<thread_id>" when the record has a truthy
        "thread_id", otherwise "chatgpt:<dataset_id>" (with an empty
        dataset_id when that key is missing).
        """
        thread_id = staging_record.get("thread_id")
        if thread_id:
            return f"chatgpt:{thread_id}"
        dataset_id = staging_record.get("dataset_id", "")
        return f"chatgpt:{dataset_id}"
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class StoreMessageToAIChatMapper(CanonicalMapper):
    """Mapper for rows of the messages table (store_message) into the canonical AI chat model."""

    def map_to_canonical(
        self,
        staging_record: Dict[str, Any],
        source: str = "store_message"
    ) -> List[CanonicalAIChatMessage]:
        """Build the single canonical message that represents a store_message record.

        Args:
            staging_record: The store_message record to convert.
            source: Source identifier; stored both as the message's source_id
                and under "original_source" in its metadata.

        Returns:
            A one-element list holding the canonical message.
        """
        record_get = staging_record.get
        conversation_id = self.extract_conversation_id(staging_record)
        metadata = {
            "thread_id": record_get("thread_id"),
            "original_source": source,
        }
        return [
            CanonicalAIChatMessage(
                message_id=record_get("message_id", ""),
                conversation_id=conversation_id,
                sender_type=record_get("sender_type", ""),
                sender_id=None,
                ts=record_get("ts", ""),
                # A present-but-None content is normalised to "".
                content=record_get("content") or "",
                content_rendered=None,
                metadata_json=metadata,
                seq=0,
                source_id=source,
            )
        ]

    def extract_conversation_id(self, staging_record: Dict[str, Any]) -> str:
        """Return the "store_message:"-prefixed conversation ID for a record.

        The thread_id is used when truthy; otherwise the dataset_id
        (empty string when that key is absent).
        """
        thread_id = staging_record.get("thread_id")
        suffix = thread_id if thread_id else staging_record.get("dataset_id", "")
        return f"store_message:{suffix}"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Source identifier -> mapper class. Seeded with the built-in mappers;
# register_mapper() adds or replaces entries and get_mapper() reads them.
_MAPPER_REGISTRY: Dict[str, type[CanonicalMapper]] = dict(
    chatgpt=ChatGPTToAIChatMapper,
    store_message=StoreMessageToAIChatMapper,
)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def register_mapper(source: str, mapper_class: type[CanonicalMapper]) -> None:
    """Register a mapper class for a source.

    An existing registration for the same source is replaced.

    Args:
        source: Source identifier used as the registry key.
        mapper_class: The CanonicalMapper subclass (the class itself, not an
            instance) to instantiate for that source.

    Raises:
        TypeError: If ``mapper_class`` is not a class or is not a subclass of
            CanonicalMapper.
    """
    # issubclass() raises its own generic TypeError when handed a non-class
    # (e.g. a mapper instance), so check isinstance first to always report
    # the descriptive message below.
    if not (isinstance(mapper_class, type) and issubclass(mapper_class, CanonicalMapper)):
        raise TypeError(f"Mapper class must be a subclass of CanonicalMapper, got {mapper_class}")
    _MAPPER_REGISTRY[source] = mapper_class
    logger.info("Registered canonical mapper for source: %s -> %s", source, mapper_class.__name__)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def get_mapper(source: str) -> CanonicalMapper:
    """Instantiate the mapper registered for ``source``.

    Args:
        source: Source identifier to look up in the mapper registry.

    Returns:
        A new instance of the registered mapper class.

    Raises:
        ValueError: If nothing is registered for ``source``; the message
            lists the registered source names.
    """
    mapper_class = _MAPPER_REGISTRY.get(source)
    if mapper_class:
        return mapper_class()

    available = ", ".join(_MAPPER_REGISTRY)
    raise ValueError(
        f"Unknown source for canonical mapping: {source}. Available sources: {available}"
    )
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Canonical data models - unified schemas for compatible sources.
|
|
2
|
+
|
|
3
|
+
Migrated from engine/canonical/model.py (commit 7b709af).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any, Dict, Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class CanonicalAIChatMessage:
    """A single AI chat message in the canonical, source-independent shape.

    Every AI chat source (ChatGPT, Claude, Gemini, etc.) is mapped onto this
    one structure.
    """
    message_id: str
    conversation_id: str  # unified conversation identifier
    sender_type: str  # "human" | "assistant" | "system"
    ts: str  # ISO timestamp
    content: str  # message text
    source_id: str = ""  # identifier of the originating source (e.g., "chatgpt")
    sender_id: Optional[str] = None  # sender identifier, when known
    content_rendered: Optional[str] = None  # rendered form of the content, when available
    metadata_json: Optional[Dict[str, Any]] = None  # source-specific metadata
    seq: int = 0  # position of the message within its conversation

    def to_dict(self) -> Dict[str, Any]:
        """Return the message as a plain dict ready for database storage.

        ``metadata_json`` is serialised to a JSON string; missing or empty
        metadata becomes ``None``.
        """
        import json

        if self.metadata_json:
            encoded_metadata = json.dumps(self.metadata_json)
        else:
            encoded_metadata = None

        return dict(
            message_id=self.message_id,
            conversation_id=self.conversation_id,
            sender_type=self.sender_type,
            sender_id=self.sender_id,
            ts=self.ts,
            content=self.content,
            content_rendered=self.content_rendered,
            metadata_json=encoded_metadata,
            seq=self.seq,
            source_id=self.source_id,
        )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class CanonicalAIChatConversation:
    """A conversation in the canonical shape shared by all AI chat sources."""
    conversation_id: str
    owner_user_id: str
    title: Optional[str] = None
    source: str = ""  # originating source (e.g., "chatgpt")
    created_at: str = ""  # ISO timestamp
    updated_at: str = ""  # ISO timestamp

    def to_dict(self) -> Dict[str, Any]:
        """Return the conversation as a plain dict ready for database storage."""
        column_order = (
            "conversation_id",
            "owner_user_id",
            "title",
            "source",
            "created_at",
            "updated_at",
        )
        return {column: getattr(self, column) for column in column_order}
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class CanonicalAIChatModel:
    """Names of the tables that hold the canonical AI chat data.

    All AI chat sources (ChatGPT, Claude, Gemini, etc.) are unified into
    this single canonical format.
    """

    _CONVERSATIONS_TABLE = "ai_chat_conversations"
    _MESSAGES_TABLE = "ai_chat_messages"

    @staticmethod
    def get_conversation_table() -> str:
        """Name of the canonical conversations table."""
        return CanonicalAIChatModel._CONVERSATIONS_TABLE

    @staticmethod
    def get_messages_table() -> str:
        """Name of the canonical messages table."""
        return CanonicalAIChatModel._MESSAGES_TABLE
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Canonical tables manager - manages canonical database tables.
|
|
2
|
+
|
|
3
|
+
Migrated from engine/canonical/tables.py (commit 7b709af).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import sqlite3
|
|
10
|
+
from typing import Any, Dict, List
|
|
11
|
+
|
|
12
|
+
from .model import CanonicalAIChatMessage, CanonicalAIChatConversation
|
|
13
|
+
|
|
14
|
+
# Module-level logger, named with the explicit dotted path of this tables module.
logger = logging.getLogger("topos.storage.canonical.ai_chat.tables")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CanonicalTablesManager:
    """Manages canonical tables for unified data models.

    Wraps a SQLite connection, creates the ``ai_chat_conversations`` and
    ``ai_chat_messages`` tables (plus their indexes) on construction, and
    offers batched upserts and per-conversation sequence renumbering.
    Every write method commits on the wrapped connection itself.
    """

    # DDL run, in order, by _ensure_tables(). Each statement uses
    # IF NOT EXISTS, so running them against an existing schema is a no-op.
    _SCHEMA_STATEMENTS = (
        """
        CREATE TABLE IF NOT EXISTS ai_chat_conversations (
            conversation_id TEXT PRIMARY KEY,
            owner_user_id TEXT NOT NULL,
            title TEXT,
            source_id TEXT NOT NULL,
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL
        )
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_ai_chat_conversations_owner
        ON ai_chat_conversations(owner_user_id)
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_ai_chat_conversations_source_id
        ON ai_chat_conversations(source_id)
        """,
        """
        CREATE TABLE IF NOT EXISTS ai_chat_messages (
            message_id TEXT PRIMARY KEY,
            conversation_id TEXT NOT NULL,
            sender_type TEXT NOT NULL,
            sender_id TEXT,
            event_at TEXT NOT NULL,
            content TEXT NOT NULL,
            content_rendered TEXT,
            metadata_json TEXT,
            sequence INTEGER NOT NULL DEFAULT 0,
            source_id TEXT NOT NULL
        )
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_ai_chat_messages_conversation
        ON ai_chat_messages(conversation_id, sequence)
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_ai_chat_messages_event_at
        ON ai_chat_messages(event_at)
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_ai_chat_messages_source
        ON ai_chat_messages(source_id)
        """,
    )

    def __init__(self, conn: sqlite3.Connection):
        """Initialize with database connection and make sure the tables exist."""
        self.conn = conn
        self._ensure_tables()

    def _ensure_tables(self) -> None:
        """Ensure canonical tables exist. Creates them if they don't exist.

        Failures are rolled back and logged but not re-raised.
        """
        try:
            for statement in self._SCHEMA_STATEMENTS:
                self.conn.execute(statement)
            self.conn.commit()
            logger.debug("Ensured canonical tables exist")
        except Exception as e:
            # NOTE(review): unlike the write methods, this path swallows the
            # error, so construction succeeds even if the schema could not be
            # created. Behaviour preserved as found -- confirm it is intended.
            self.conn.rollback()
            logger.error("Failed to ensure canonical tables: %s", e)

    def _write_in_batches(
        self,
        sql: str,
        items: List[Any],
        to_row: Any,
        batch_size: int,
        noun: str,
    ) -> int:
        """Run ``sql`` via executemany over ``items`` in committed batches.

        Args:
            sql: Parameterised INSERT statement.
            items: Objects to write.
            to_row: Callable turning one item into the parameter tuple for ``sql``.
            batch_size: Maximum number of rows per executemany/commit.
            noun: Plural label used in log messages ("conversations"/"messages").

        Returns:
            Number of items written.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
            Exception: Any error raised while writing; the failing batch is
                rolled back first (batches committed earlier stay committed).
        """
        # A negative step would make range() empty and silently write nothing;
        # zero would fail inside range() with an unrelated message.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        written = 0
        try:
            for start in range(0, len(items), batch_size):
                batch = items[start:start + batch_size]
                self.conn.executemany(sql, [to_row(item) for item in batch])
                self.conn.commit()
                written += len(batch)
                logger.debug("Wrote batch of %d %s (total: %d)", len(batch), noun, written)
        except Exception as e:
            self.conn.rollback()
            logger.error("Failed to write %s batch: %s", noun, e)
            raise
        return written

    def write_conversations_batch(
        self,
        conversations: List[CanonicalAIChatConversation],
        batch_size: int = 1000,
    ) -> int:
        """Write multiple conversations to canonical table in batches.

        Rows are written with INSERT OR REPLACE, so an existing row with the
        same conversation_id is replaced. The conversation's ``source``
        attribute is stored in the ``source_id`` column.

        Args:
            conversations: Conversations to write; an empty list is a no-op.
            batch_size: Rows per batch; each batch is committed separately.

        Returns:
            Number of conversations written.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1 (and there is
                something to write).
        """
        if not conversations:
            return 0

        def to_row(conv: Any) -> tuple:
            return (
                conv.conversation_id,
                conv.owner_user_id,
                conv.title,
                conv.source,
                conv.created_at,
                conv.updated_at,
            )

        return self._write_in_batches(
            """
            INSERT OR REPLACE INTO ai_chat_conversations (
                conversation_id, owner_user_id, title, source_id, created_at, updated_at
            ) VALUES (?, ?, ?, ?, ?, ?)
            """,
            conversations,
            to_row,
            batch_size,
            "conversations",
        )

    def write_messages_batch(
        self,
        messages: List[CanonicalAIChatMessage],
        batch_size: int = 1000,
    ) -> int:
        """Write multiple messages to canonical table in batches.

        Rows are written with INSERT OR REPLACE, so an existing row with the
        same message_id is replaced. ``ts`` is stored in the ``event_at``
        column and ``seq`` in ``sequence``; metadata is stored as a JSON
        string, or NULL when missing or empty.

        Args:
            messages: Messages to write; an empty list is a no-op.
            batch_size: Rows per batch; each batch is committed separately.

        Returns:
            Number of messages written.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1 (and there is
                something to write).
        """
        if not messages:
            return 0
        import json

        def to_row(msg: Any) -> tuple:
            return (
                msg.message_id,
                msg.conversation_id,
                msg.sender_type,
                msg.sender_id,
                msg.ts,
                msg.content,
                msg.content_rendered,
                json.dumps(msg.metadata_json) if msg.metadata_json else None,
                msg.seq,
                msg.source_id,
            )

        return self._write_in_batches(
            """
            INSERT OR REPLACE INTO ai_chat_messages (
                message_id, conversation_id, sender_type, sender_id, event_at,
                content, content_rendered, metadata_json, sequence, source_id
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            messages,
            to_row,
            batch_size,
            "messages",
        )

    def update_message_sequences(self, conversation_id: str) -> None:
        """Update sequence numbers for messages in a conversation.

        Messages are ordered by ``event_at`` ascending and renumbered
        0, 1, 2, ... in that order; the change is committed.

        Args:
            conversation_id: Conversation whose messages are renumbered.

        Raises:
            Exception: Any error raised by the database; the transaction is
                rolled back and the error logged before re-raising.
        """
        try:
            cursor = self.conn.execute("""
                SELECT message_id, event_at
                FROM ai_chat_messages
                WHERE conversation_id = ?
                ORDER BY event_at ASC
            """, (conversation_id,))
            messages = cursor.fetchall()
            # One executemany call instead of one execute() per row.
            self.conn.executemany("""
                UPDATE ai_chat_messages
                SET sequence = ?
                WHERE message_id = ?
            """, [(seq, message_id) for seq, (message_id, _) in enumerate(messages)])
            self.conn.commit()
            logger.debug("Updated sequences for conversation %s (%d messages)", conversation_id, len(messages))
        except Exception as e:
            self.conn.rollback()
            logger.error("Failed to update message sequences: %s", e)
            raise
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Dict
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
class CanonicalRef:
    """Immutable reference to a record held in a canonical store."""

    # Identifier of the referenced record.
    record_id: str
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CanonicalStore:
    """Base interface for stores that upsert canonical records."""

    def upsert(self, record: Dict[str, str]) -> CanonicalRef:
        """Insert or update ``record`` and return a reference to it.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class InMemoryCanonicalStore(CanonicalStore):
    """Canonical store that keeps its records in a dict on the instance."""

    def __init__(self):
        # record id -> most recently upserted record
        self._records: Dict[str, Dict[str, str]] = {}

    def upsert(self, record: Dict[str, str]) -> CanonicalRef:
        """Store ``record`` under its id, replacing any earlier record with that id.

        The id is the record's "record_id" when truthy, else its
        "message_id" when truthy, else the empty string.
        """
        key = record.get("record_id")
        if not key:
            key = record.get("message_id")
        if not key:
            key = ""
        self._records[key] = record
        return CanonicalRef(record_id=key)
|