topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""ChatGPT parser for ingestion layer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any, Dict
|
|
8
|
+
|
|
9
|
+
from ..log_preview import field_preview
|
|
10
|
+
from ..sources.base import RawRecord
|
|
11
|
+
from ..validation.base import ValidationResult
|
|
12
|
+
from ..validation.schema_registry import validate_schema
|
|
13
|
+
from .base import NormalizedRecord, Parser
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger("topos.ingestion.parser.chatgpt")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class ChatGPTParser(Parser):
    """Map raw ChatGPT message records to the normalized chat shape.

    ``parse`` produces a payload with ``message_id``, ``dataset_id``,
    ``thread_id``, ``ts``, ``sender_type`` and ``content``; ``validate``
    checks the raw payload against the schema returned by ``schema_id``.
    """

    # Dataset identifier copied into every normalized payload.
    dataset_id: str
    _schema_id: str = "chatgpt.conversation.v1"  # Default to v1, can be overridden

    def parse(self, raw: RawRecord) -> NormalizedRecord:
        """Normalize one raw record.

        - ``sender_type`` is ``"human"`` when the role is ``"user"``
          (case-insensitive) and ``"assistant"`` for anything else.
        - ``ts`` is an ISO-8601 UTC string for numeric ``created_at``, the
          value itself for string ``created_at``, and ``""`` otherwise
          (including numeric values that are out of range).
        - ``message_id`` falls back to ``raw.record_id`` when the payload has
          no ``id`` or an explicit null ``id``.
        """
        payload = raw.payload
        # A present-but-null (or non-string) role must not abort parsing.
        role = str(payload.get("role") or "").lower()
        sender_type = "human" if role == "user" else "assistant"
        created_at = payload.get("created_at")
        ts = ""
        if isinstance(created_at, (int, float)):
            from datetime import datetime, timezone

            try:
                ts = datetime.fromtimestamp(created_at, tz=timezone.utc).isoformat()
            except (OverflowError, OSError, ValueError):
                # Keep ingestion resilient if a source record has an out-of-range timestamp.
                ts = ""
        elif isinstance(created_at, str):
            ts = created_at
        message_id = payload.get("id")
        if message_id is None:
            # Covers both a missing key and an explicit null id.
            message_id = raw.record_id
        normalized = {
            "message_id": message_id,
            "dataset_id": self.dataset_id,
            "thread_id": payload.get("thread_id", ""),
            "ts": ts,
            "sender_type": sender_type,
            "content": payload.get("content", ""),
        }
        # Preserve _metadata if present (for conversation tree reconstruction)
        if "_metadata" in payload:
            normalized["_metadata"] = payload["_metadata"]
        logger.debug(
            "[PIPELINE:PARSER] Parsed record: message_id=%s, sender_type=%s, thread_id=%s, content_preview=%s",
            normalized["message_id"],
            normalized["sender_type"],
            normalized["thread_id"],
            field_preview(normalized.get("content")),
        )
        return NormalizedRecord(record_id=normalized["message_id"], payload=normalized)

    def validate(self, record: RawRecord) -> ValidationResult:
        """Validate the raw payload against this parser's schema id.

        The returned result carries a single error message when validation
        fails ("Invalid record" if the validator gave none) and an empty
        metadata dict.
        """
        is_valid, error = validate_schema(record.payload, self.schema_id())
        errors = [] if is_valid else [error or "Invalid record"]
        logger.debug(
            "[PIPELINE:PARSER] Validation result: record_id=%s, is_valid=%s, errors=%s",
            record.record_id,
            is_valid,
            errors,
        )
        return ValidationResult(is_valid=is_valid, errors=errors, metadata={})

    def schema_id(self) -> str:
        """Return the schema id used by ``validate``."""
        return self._schema_id
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from ..sources.base import RawRecord
|
|
6
|
+
from ..validation.base import ValidationResult
|
|
7
|
+
from .base import NormalizedRecord, Parser
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class GrokParser(Parser):
    """Pass-through parser for Grok records (schema ``grok.conversation.v1``)."""

    # Dataset identifier; not used by any method of this class.
    dataset_id: str

    def parse(self, raw: RawRecord) -> NormalizedRecord:
        """Wrap the raw record unchanged: same record id, same payload object."""
        record_id, payload = raw.record_id, raw.payload
        return NormalizedRecord(record_id=record_id, payload=payload)

    def validate(self, record: RawRecord) -> ValidationResult:
        """Report every record as valid; no schema check is performed."""
        accepted = ValidationResult(is_valid=True, errors=[], metadata={})
        return accepted

    def schema_id(self) -> str:
        """Return the fixed schema id for Grok conversations."""
        return "grok.conversation.v1"
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Minimal parsers for messenger ingestion (iMessage, Signal).
|
|
2
|
+
|
|
3
|
+
Maps raw dict/row to normalized chat shape (message_id, thread_id, sender_type, content, ts).
|
|
4
|
+
Full implementation (reading from chat.db / Signal DB) is in Sprints 03 and 04.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import Any, Dict
|
|
12
|
+
|
|
13
|
+
from ..sources.base import RawRecord
|
|
14
|
+
from ..validation.base import ValidationResult
|
|
15
|
+
from ..validation.schema_registry import validate_schema
|
|
16
|
+
from .base import NormalizedRecord, Parser
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger("topos.ingestion.parser.messenger")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _normalize_messenger_payload(payload: Dict[str, Any], record_id: str, dataset_id: str) -> Dict[str, Any]:
|
|
22
|
+
"""Convert raw messenger record to normalized shape for conversation_messages."""
|
|
23
|
+
role = (payload.get("role") or payload.get("sender_type") or "user").lower()
|
|
24
|
+
sender_type = "human" # Preserve legacy semantics; identity is carried in sender_id.
|
|
25
|
+
created_at = payload.get("created_at") or payload.get("ts")
|
|
26
|
+
ts = ""
|
|
27
|
+
if isinstance(created_at, (int, float)):
|
|
28
|
+
from datetime import datetime, timezone
|
|
29
|
+
try:
|
|
30
|
+
ts = datetime.fromtimestamp(created_at, tz=timezone.utc).isoformat()
|
|
31
|
+
except (OverflowError, OSError, ValueError):
|
|
32
|
+
# Keep ingestion resilient if a source record has an out-of-range timestamp.
|
|
33
|
+
ts = ""
|
|
34
|
+
elif isinstance(created_at, str):
|
|
35
|
+
ts = created_at
|
|
36
|
+
normalized = {
|
|
37
|
+
"message_id": str(payload.get("id") or payload.get("message_id") or record_id),
|
|
38
|
+
"dataset_id": dataset_id,
|
|
39
|
+
"thread_id": str(payload.get("thread_id") or payload.get("conversation_id") or ""),
|
|
40
|
+
"conversation_id": str(payload.get("thread_id") or payload.get("conversation_id") or ""),
|
|
41
|
+
"ts": ts,
|
|
42
|
+
"sender_type": sender_type,
|
|
43
|
+
"content": (payload.get("content") or "") or "",
|
|
44
|
+
}
|
|
45
|
+
if "_metadata" in payload:
|
|
46
|
+
normalized["_metadata"] = payload["_metadata"]
|
|
47
|
+
if payload.get("sender_id") is not None:
|
|
48
|
+
normalized["sender_id"] = str(payload["sender_id"])
|
|
49
|
+
if payload.get("reply_to_message_id") is not None:
|
|
50
|
+
normalized["reply_to_message_id"] = str(payload["reply_to_message_id"])
|
|
51
|
+
if payload.get("message_type") is not None:
|
|
52
|
+
normalized["message_type"] = str(payload["message_type"])
|
|
53
|
+
if payload.get("event_type") is not None:
|
|
54
|
+
normalized["event_type"] = str(payload["event_type"])
|
|
55
|
+
return normalized
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class ImessageParser(Parser):
    """Parser for iMessage normalized records (imessage.messages.v1)."""

    # Dataset identifier copied into every normalized payload.
    dataset_id: str
    # Schema id checked by validate() and reported by schema_id().
    _schema_id: str = "imessage.messages.v1"

    def parse(self, raw: RawRecord) -> NormalizedRecord:
        """Normalize the raw payload; the record id is the normalized message id."""
        shaped = _normalize_messenger_payload(raw.payload, raw.record_id, self.dataset_id)
        return NormalizedRecord(record_id=shaped["message_id"], payload=shaped)

    def validate(self, record: RawRecord) -> ValidationResult:
        """Validate the raw payload against this parser's schema id."""
        ok, problem = validate_schema(record.payload, self._schema_id)
        if ok:
            found = []
        else:
            # Fall back to a generic message when the validator gave none.
            found = [problem or "Invalid record"]
        return ValidationResult(is_valid=ok, errors=found, metadata={})

    def schema_id(self) -> str:
        """Return the schema id used by ``validate``."""
        return self._schema_id
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class SignalParser(Parser):
    """Parser for Signal normalized records (signal.messages.v1)."""

    # Dataset identifier copied into every normalized payload.
    dataset_id: str
    # Schema id checked by validate() and reported by schema_id().
    _schema_id: str = "signal.messages.v1"

    def parse(self, raw: RawRecord) -> NormalizedRecord:
        """Normalize the raw payload; the record id is the normalized message id."""
        shaped = _normalize_messenger_payload(raw.payload, raw.record_id, self.dataset_id)
        return NormalizedRecord(record_id=shaped["message_id"], payload=shaped)

    def validate(self, record: RawRecord) -> ValidationResult:
        """Validate the raw payload against this parser's schema id."""
        ok, problem = validate_schema(record.payload, self._schema_id)
        if ok:
            found = []
        else:
            # Fall back to a generic message when the validator gave none.
            found = [problem or "Invalid record"]
        return ValidationResult(is_valid=ok, errors=found, metadata={})

    def schema_id(self) -> str:
        """Return the schema id used by ``validate``."""
        return self._schema_id
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Progress tracking for ingestion jobs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class IngestionProgress:
    """Track how far an ingestion job has progressed.

    Holds the processed/total record counts, the current step name, an error
    counter and wall-clock timestamps, and derives a completion percentage
    and a remaining-time estimate from them.
    """

    def __init__(self, job_id: str, records_total: Optional[int] = None):
        """Start tracking ``job_id``; ``records_total`` may be unknown (None)."""
        self.job_id = job_id
        self.records_total = records_total
        self.records_processed = 0
        self.start_time = time.time()
        self.last_update_time = self.start_time
        self.current_step = "parsing"
        self.errors_count = 0

    def update(self, records_processed: int, current_step: Optional[str] = None) -> None:
        """Record the new processed count and, if given, the new step name."""
        self.records_processed = records_processed
        self.last_update_time = time.time()
        if current_step:
            self.current_step = current_step

    def get_progress_percent(self) -> float:
        """Return completion in the range 0.0-100.0 (0.0 when the total is unknown or zero)."""
        if not self.records_total:
            return 0.0
        percent = (self.records_processed / self.records_total) * 100.0
        # Clamp on both sides so odd counter values never leave 0-100.
        return max(0.0, min(100.0, percent))

    def get_estimated_seconds_remaining(self) -> Optional[int]:
        """Estimate the seconds left from the average rate since the start.

        Returns None when no estimate is possible (unknown total, nothing
        processed yet, or a non-positive elapsed time or rate). Returns 0
        once the processed count has reached or passed the total.
        """
        if not self.records_total or self.records_processed == 0:
            return None
        elapsed = time.time() - self.start_time
        if elapsed <= 0:
            return None
        rate = self.records_processed / elapsed
        if rate <= 0:
            return None
        # The processed count may overshoot the total; never report a negative ETA.
        remaining = max(0, self.records_total - self.records_processed)
        return int(remaining / rate)

    def should_report(self, min_interval: float = 1.0) -> bool:
        """Return True when at least ``min_interval`` seconds have passed since the last update()."""
        return (time.time() - self.last_update_time) >= min_interval

    def to_dict(self) -> dict:
        """Return a snapshot of the progress state as a plain dict."""
        return {
            "job_id": self.job_id,
            "progress_percent": self.get_progress_percent(),
            "records_processed": self.records_processed,
            "records_total": self.records_total,
            "estimated_seconds_remaining": self.get_estimated_seconds_remaining(),
            "current_step": self.current_step,
            "errors_count": self.errors_count,
        }
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Source connector registry."""
|
|
2
|
+
|
|
3
|
+
from .base import SourceConnector
|
|
4
|
+
from .calendar import CalendarSourceConnector
|
|
5
|
+
from .chatgpt import ChatGPTSourceConnector
|
|
6
|
+
from .grok import GrokSourceConnector
|
|
7
|
+
|
|
8
|
+
# Source key -> connector class registered under that key.
SOURCE_REGISTRY = dict(
    chatgpt=ChatGPTSourceConnector,
    grok=GrokSourceConnector,
    calendar=CalendarSourceConnector,
)

# Names exported by this package.
__all__ = [
    "SourceConnector",
    "ChatGPTSourceConnector",
    "GrokSourceConnector",
    "CalendarSourceConnector",
    "SOURCE_REGISTRY",
]
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Dict, Literal
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
class SourcePayload:
    """Immutable wrapper around the payload handed to ``SourceConnector.ingest``."""

    # Payload mapping (annotated as string keys to string values).
    payload: Dict[str, str]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class RawRecord:
    """Immutable raw record: an identifier plus its payload mapping."""

    # Identifier of the record.
    record_id: str
    # Raw record fields.
    payload: Dict[str, str]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
class SourceIdentity:
    """Immutable triple identifying where a record came from."""

    # Name of the originating system (e.g. "chatgpt", "calendar").
    source_system: str
    # Identifier of the record within that system.
    source_record_id: str
    # Identifier of the export the record belongs to.
    source_export_id: str
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SourceConnector:
    """Base interface for source connectors.

    Every method raises ``NotImplementedError`` here; concrete connectors
    override all four and supply ``source_name`` / ``source_type``.
    """

    # Key naming the source.
    source_name: str
    # Kind of backing store the source reads from.
    source_type: Literal["file", "sqlite"]

    def ingest(self, payload: SourcePayload) -> str:
        """Ingest ``payload`` and return a string result (subclass-defined)."""
        raise NotImplementedError

    def schema(self) -> Dict[str, str]:
        """Return schema information for this source."""
        raise NotImplementedError

    def identity(self, record: RawRecord) -> SourceIdentity:
        """Return the source identity for ``record``."""
        raise NotImplementedError

    def canonical_eligible(self) -> bool:
        """Return whether this source's records are eligible for canonicalization."""
        raise NotImplementedError
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Dict, Literal
|
|
5
|
+
|
|
6
|
+
from .base import RawRecord, SourceConnector, SourceIdentity, SourcePayload
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class CalendarSourceConnector(SourceConnector):
    """File-based connector for calendar events (schema ``calendar.events.v1``)."""

    source_name: str = "calendar"
    source_type: Literal["file", "sqlite"] = "file"

    def ingest(self, payload: SourcePayload) -> str:
        """Return the calendar schema id; the payload is intentionally ignored."""
        del payload  # unused by this connector
        return "calendar.events.v1"

    def schema(self) -> Dict[str, str]:
        """Return a mapping holding the calendar schema id."""
        return dict(schema_id="calendar.events.v1")

    def identity(self, record: RawRecord) -> SourceIdentity:
        """Build the identity; the record id serves as both record id and export id."""
        rid = record.record_id
        return SourceIdentity(
            source_system="calendar",
            source_record_id=rid,
            source_export_id=rid,
        )

    def canonical_eligible(self) -> bool:
        """Calendar records are not eligible for canonicalization."""
        return False
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Dict, Literal
|
|
5
|
+
|
|
6
|
+
from .base import RawRecord, SourceConnector, SourceIdentity, SourcePayload
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class ChatGPTSourceConnector(SourceConnector):
    """File-based connector for ChatGPT conversations (schema ``chatgpt.conversation.v1``)."""

    source_name: str = "chatgpt"
    source_type: Literal["file", "sqlite"] = "file"

    def ingest(self, payload: SourcePayload) -> str:
        """Return the ChatGPT schema id; the payload is intentionally ignored."""
        del payload  # unused by this connector
        return "chatgpt.conversation.v1"

    def schema(self) -> Dict[str, str]:
        """Return a mapping holding the ChatGPT schema id."""
        return dict(schema_id="chatgpt.conversation.v1")

    def identity(self, record: RawRecord) -> SourceIdentity:
        """Build the identity; the record id serves as both record id and export id."""
        rid = record.record_id
        return SourceIdentity(
            source_system="chatgpt",
            source_record_id=rid,
            source_export_id=rid,
        )

    def canonical_eligible(self) -> bool:
        """ChatGPT records are eligible for canonicalization."""
        return True
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import platform
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
import ssl
|
|
9
|
+
import subprocess
|
|
10
|
+
import time
|
|
11
|
+
import urllib.error
|
|
12
|
+
import urllib.parse
|
|
13
|
+
import urllib.request
|
|
14
|
+
from typing import Any, Dict, List, Optional
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
GOOGLE_DEVICE_CODE_URL = "https://oauth2.googleapis.com/device/code"
|
|
18
|
+
GOOGLE_TOKEN_URL = "https://oauth2.googleapis.com/token"
|
|
19
|
+
GOOGLE_PEOPLE_CONNECTIONS_URL = "https://people.googleapis.com/v1/people/me/connections"
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger("topos.ingestion.sources.contact_importers")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _build_ssl_context() -> ssl.SSLContext:
|
|
25
|
+
"""
|
|
26
|
+
Build a verified TLS context for outbound Google API calls.
|
|
27
|
+
|
|
28
|
+
On some macOS Python runtimes, default trust roots are missing and
|
|
29
|
+
certificate validation fails. Prefer certifi when available.
|
|
30
|
+
"""
|
|
31
|
+
try:
|
|
32
|
+
import certifi
|
|
33
|
+
|
|
34
|
+
return ssl.create_default_context(cafile=certifi.where())
|
|
35
|
+
except Exception:
|
|
36
|
+
return ssl.create_default_context()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _http_post_form(url: str, body: Dict[str, Any]) -> Dict[str, Any]:
    """POST ``body`` as a URL-encoded form and return the decoded JSON reply.

    Fields whose value is None are left out of the form. On an HTTP error
    status the error body is returned as parsed JSON when it parses;
    otherwise a dict of the form ``{"error": "http_<code>",
    "error_description": <raw body>}`` is returned. Other failures (for
    example ``urllib.error.URLError``) are not caught here.
    """
    form_fields = {key: value for key, value in body.items() if value is not None}
    encoded = urllib.parse.urlencode(form_fields).encode("utf-8")
    request = urllib.request.Request(
        url=url,
        data=encoded,
        method="POST",
        headers={"Content-Type": "application/x-www-form-urlencoded"},
    )
    try:
        with urllib.request.urlopen(request, timeout=30, context=_build_ssl_context()) as response:
            text = response.read().decode("utf-8")
            return json.loads(text)
    except urllib.error.HTTPError as http_err:
        error_text = http_err.read().decode("utf-8", errors="replace")
        try:
            return json.loads(error_text)
        except Exception:
            # Non-JSON error body: synthesize an OAuth-style error dict.
            return {"error": f"http_{http_err.code}", "error_description": error_text}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _http_get_json(url: str, bearer_token: str) -> Dict[str, Any]:
    """GET ``url`` with a Bearer token and return the decoded JSON reply.

    On an HTTP error status the error body is returned as parsed JSON when
    it parses.

    Raises:
        RuntimeError: on an HTTP error whose body is not JSON (the message
            includes the status and the first 500 characters of the body),
            or on a ``urllib.error.URLError`` (TLS/network failure).
    """
    auth_headers = {"Authorization": f"Bearer {bearer_token}"}
    request = urllib.request.Request(url=url, method="GET", headers=auth_headers)
    try:
        with urllib.request.urlopen(request, timeout=30, context=_build_ssl_context()) as response:
            text = response.read().decode("utf-8")
            return json.loads(text)
    except urllib.error.HTTPError as http_err:
        # HTTPError is handled before URLError, which is its base class.
        error_text = http_err.read().decode("utf-8", errors="replace")
        try:
            return json.loads(error_text)
        except Exception:
            raise RuntimeError(f"Google API request failed: HTTP {http_err.code}: {error_text[:500]}") from http_err
    except urllib.error.URLError as net_err:
        raise RuntimeError(
            f"Google API TLS/network error: {net_err}. "
            "If this is a certificate verify failure, ensure certifi is installed in the engine environment."
        ) from net_err
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _normalize_phone(value: Any) -> str:
|
|
81
|
+
s = str(value or "").strip()
|
|
82
|
+
if not s:
|
|
83
|
+
return ""
|
|
84
|
+
# Preserve a leading + where present, drop formatting characters.
|
|
85
|
+
plus = s.startswith("+")
|
|
86
|
+
digits = re.sub(r"[^\d]", "", s)
|
|
87
|
+
if not digits:
|
|
88
|
+
return ""
|
|
89
|
+
return f"+{digits}" if plus else digits
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def import_apple_contacts_local() -> List[Dict[str, Any]]:
    """
    Read Apple Contacts locally on macOS via JXA (osascript JavaScript bridge).

    Returns normalized records: [{"display_name": str, "identifiers": [{"type","identifier"}]}]

    ``display_name`` is None when the contact has no name. Phone identifiers
    are passed through ``_normalize_phone``; e-mail identifiers are trimmed
    and lower-cased. Contacts without any usable identifier are dropped.

    Raises:
        RuntimeError: when not running on macOS, when ``osascript`` is not on
            PATH, when the script exits non-zero, or when its output is not a
            JSON array.
    """
    current_platform = platform.system().lower()
    logger.info("[CONTACT_IMPORT] Apple import start: platform=%s", current_platform)
    if current_platform != "darwin":
        raise RuntimeError("Apple Contacts import is only available on macOS")
    osascript_bin = shutil.which("osascript")
    logger.info("[CONTACT_IMPORT] Apple import environment: osascript=%s", osascript_bin or "missing")
    if not osascript_bin:
        raise RuntimeError("osascript not found in PATH; Apple Contacts import requires macOS host runtime")

    # The script emits a JSON array of {name, phones, emails}; per-field
    # failures inside Contacts are swallowed so one bad entry cannot abort the export.
    jxa_script = r"""
    const app = Application("Contacts");
    const people = app.people();
    const out = [];
    for (let i = 0; i < people.length; i++) {
      const p = people[i];
      const name = (() => { try { return String(p.name() || "").trim(); } catch (_) { return ""; } })();
      const phones = [];
      const emails = [];
      try {
        const ph = p.phones();
        for (let j = 0; j < ph.length; j++) {
          const val = String(ph[j].value() || "").trim();
          if (val) phones.push(val);
        }
      } catch (_) {}
      try {
        const em = p.emails();
        for (let j = 0; j < em.length; j++) {
          const val = String(em[j].value() || "").trim();
          if (val) emails.push(val);
        }
      } catch (_) {}
      if (name || phones.length || emails.length) out.push({ name, phones, emails });
    }
    JSON.stringify(out);
    """
    proc = subprocess.run(
        # Run the binary that was just resolved and logged, not a second PATH lookup.
        [osascript_bin, "-l", "JavaScript", "-e", jxa_script],
        capture_output=True,
        text=True,
        check=False,
    )
    if proc.returncode != 0:
        stderr = (proc.stderr or "").strip()
        stdout = (proc.stdout or "").strip()
        logger.error(
            "[CONTACT_IMPORT] Apple import osascript failed: returncode=%s stderr=%s stdout=%s",
            proc.returncode,
            stderr[:500],
            stdout[:500],
        )
        raise RuntimeError((stderr or stdout or "Failed to read Apple Contacts").strip())
    try:
        raw_items = json.loads(proc.stdout or "[]")
    except Exception as e:
        raise RuntimeError(f"Failed to parse Apple Contacts output: {e}") from e
    if not isinstance(raw_items, list):
        # e.g. "null" or an object: report it like any other unparseable output.
        raise RuntimeError(
            f"Failed to parse Apple Contacts output: expected a JSON array, got {type(raw_items).__name__}"
        )

    imported: List[Dict[str, Any]] = []
    for item in raw_items:
        if not isinstance(item, dict):
            # Null or malformed entries carry no identifiers; skip them.
            continue
        name = str(item.get("name") or "").strip()
        identifiers: List[Dict[str, str]] = []
        for p in item.get("phones") or []:
            phone = _normalize_phone(p)
            if phone:
                identifiers.append({"type": "phone", "identifier": phone})
        for e in item.get("emails") or []:
            email = str(e or "").strip().lower()
            if email:
                identifiers.append({"type": "email", "identifier": email})
        if identifiers:
            imported.append({"display_name": name or None, "identifiers": identifiers})
    logger.info(
        "[CONTACT_IMPORT] Apple import complete: raw_contacts=%d imported_contacts=%d",
        len(raw_items),
        len(imported),
    )
    return imported
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def start_google_device_auth(client_id: str) -> Dict[str, Any]:
    """Begin the Google device-authorization flow for read-only contacts access.

    Posts ``client_id`` and the requested scope to ``GOOGLE_DEVICE_CODE_URL``
    through ``_http_post_form`` and hands the decoded response back unchanged.

    Args:
        client_id: Google OAuth client id; surrounding whitespace is stripped
            before it is sent.

    Returns:
        The response mapping from ``_http_post_form``. When it carries a truthy
        ``"error"`` entry the failure is logged, but the mapping is still
        returned rather than raised.

    Raises:
        RuntimeError: If ``client_id`` is empty or whitespace-only.
    """
    has_client_id = bool(str(client_id or "").strip())
    if not has_client_id:
        raise RuntimeError("google_client_id is required")

    logger.info("[CONTACT_IMPORT] Google device auth start requested")

    form_fields = {
        "client_id": client_id.strip(),
        "scope": "openid https://www.googleapis.com/auth/contacts.readonly",
    }
    response = _http_post_form(GOOGLE_DEVICE_CODE_URL, form_fields)

    # Success path first; the error path below only adds logging.
    if not response.get("error"):
        logger.info("[CONTACT_IMPORT] Google device auth start succeeded")
        return response

    logger.warning(
        "[CONTACT_IMPORT] Google device auth start failed: error=%s description=%s",
        response.get("error"),
        response.get("error_description"),
    )
    return response
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def finish_google_device_auth(
    *,
    client_id: str,
    device_code: str,
    interval_seconds: int = 5,
    timeout_seconds: int = 120,
) -> Dict[str, Any]:
    """Poll the Google token endpoint until device authorization resolves.

    Repeatedly posts the device-code grant to ``GOOGLE_TOKEN_URL`` through
    ``_http_post_form`` and stops on the first terminal outcome.

    Args:
        client_id: Google OAuth client id; surrounding whitespace is stripped.
        device_code: Device code obtained when the flow was started.
        interval_seconds: Requested delay between polls. Values below 2 are
            raised to 2; ``None`` or 0 falls back to 5.
        timeout_seconds: Overall polling budget. Values below 30 are raised to
            30; ``None`` falls back to 120.

    Returns:
        One of:
        * the token response, as soon as it contains a truthy ``"access_token"``;
        * the endpoint's response as-is when its ``"error"`` is anything other
          than ``"authorization_pending"`` or ``"slow_down"``;
        * a synthetic ``{"error": "authorization_timeout", ...}`` mapping once
          the polling budget is exhausted.
    """
    logger.info(
        "[CONTACT_IMPORT] Google device auth finish polling start: interval=%s timeout=%s",
        interval_seconds,
        timeout_seconds,
    )
    interval = max(2, int(interval_seconds or 5))
    # Resolve the timeout once, up front, so a bad value fails before any
    # network traffic instead of after the first poll. ``None`` is tolerated
    # the same way it is for the interval.
    timeout = max(30, int(timeout_seconds if timeout_seconds is not None else 120))
    stripped_client_id = client_id.strip()
    # A monotonic clock keeps the budget immune to wall-clock adjustments
    # (NTP steps, manual clock changes) while we are sleeping between polls.
    started = time.monotonic()

    while True:
        result = _http_post_form(
            GOOGLE_TOKEN_URL,
            {
                "client_id": stripped_client_id,
                "device_code": device_code,
                "grant_type": "urn:ietf:params:oauth:grant-type:device_code",
            },
        )
        if result.get("access_token"):
            logger.info("[CONTACT_IMPORT] Google device auth finish succeeded")
            return result

        error = result.get("error")
        if error not in {"authorization_pending", "slow_down"}:
            # Anything other than "keep waiting" is terminal; surface it as-is.
            logger.warning(
                "[CONTACT_IMPORT] Google device auth finish failed: error=%s description=%s",
                error,
                result.get("error_description"),
            )
            return result

        if time.monotonic() - started > timeout:
            return {
                "error": "authorization_timeout",
                "error_description": "Timed out waiting for Google authorization",
            }

        if error == "slow_down":
            # RFC 8628 section 3.5: on slow_down the polling interval must be
            # increased by 5 seconds for this and all subsequent requests.
            interval += 5
        time.sleep(interval)
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def import_google_contacts(access_token: str) -> List[Dict[str, Any]]:
    """Fetch the user's Google contacts and reduce them to identifier records.

    Pages through ``GOOGLE_PEOPLE_CONNECTIONS_URL`` via ``_http_get_json``,
    requesting names, email addresses and phone numbers, 1000 entries per page
    and at most 20 pages per call.

    Args:
        access_token: Token passed to ``_http_get_json`` for each page request.

    Returns:
        A list of ``{"display_name": ..., "identifiers": [...]}`` dicts.
        ``display_name`` is the first non-empty ``displayName`` of the person
        (or ``None``); each identifier is ``{"type": "phone" | "email",
        "identifier": value}``. Phones go through ``_normalize_phone`` and
        emails are stripped and lower-cased. People without any usable
        identifier are omitted.
    """
    logger.info("[CONTACT_IMPORT] Google contacts fetch start")

    max_pages = 20
    # Query parameters that are identical for every page request.
    base_params = {
        "pageSize": "1000",
        "personFields": "names,emailAddresses,phoneNumbers",
        "sortOrder": "LAST_MODIFIED_ASCENDING",
    }

    imported: List[Dict[str, Any]] = []
    page_token: Optional[str] = None
    for _ in range(max_pages):
        params = dict(base_params)
        if page_token:
            params["pageToken"] = page_token
        url = f"{GOOGLE_PEOPLE_CONNECTIONS_URL}?{urllib.parse.urlencode(params)}"
        data = _http_get_json(url, access_token)

        for person in data.get("connections") or []:
            # First non-empty display name wins; None when there is none.
            name = None
            for name_entry in person.get("names") or []:
                display = str(name_entry.get("displayName") or "").strip()
                if display:
                    name = display
                    break

            identifiers: List[Dict[str, str]] = []
            for phone_entry in person.get("phoneNumbers") or []:
                phone = _normalize_phone(phone_entry.get("value"))
                if phone:
                    identifiers.append({"type": "phone", "identifier": phone})
            for email_entry in person.get("emailAddresses") or []:
                email = str(email_entry.get("value") or "").strip().lower()
                if email:
                    identifiers.append({"type": "email", "identifier": email})

            if identifiers:
                imported.append({"display_name": name, "identifiers": identifiers})

        page_token = data.get("nextPageToken")
        if not page_token:
            break
    else:
        # The loop ran out of pages while the API still reported more data.
        # The partial result is still returned, but the truncation is made
        # visible instead of being silently dropped.
        logger.warning(
            "[CONTACT_IMPORT] Google contacts fetch truncated at page cap: max_pages=%d imported_contacts=%d",
            max_pages,
            len(imported),
        )

    logger.info("[CONTACT_IMPORT] Google contacts fetch complete: imported_contacts=%d", len(imported))
    return imported
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Dict, Literal
|
|
5
|
+
|
|
6
|
+
from .base import RawRecord, SourceConnector, SourceIdentity, SourcePayload
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class GrokSourceConnector(SourceConnector):
    """Connector that identifies Grok records and reports their schema id."""

    # Name under which this connector is registered/reported.
    source_name: str = "grok"
    # Kind of backing store this connector reads from; defaults to "file".
    source_type: Literal["file", "sqlite"] = "file"

    def ingest(self, payload: SourcePayload) -> str:
        """Return the Grok conversation schema id; ``payload`` is not inspected."""
        del payload  # accepted for interface compatibility only
        schema_id = "grok.conversation.v1"
        return schema_id

    def schema(self) -> Dict[str, str]:
        """Return a mapping whose ``schema_id`` is the Grok conversation schema."""
        descriptor: Dict[str, str] = {"schema_id": "grok.conversation.v1"}
        return descriptor

    def identity(self, record: RawRecord) -> SourceIdentity:
        """Build the source identity for ``record``.

        The record's ``record_id`` is used for both the record id and the
        export id of the returned identity.
        """
        record_id = record.record_id
        return SourceIdentity(
            source_system="grok",
            source_record_id=record_id,
            source_export_id=record_id,
        )

    def canonical_eligible(self) -> bool:
        """Report that this connector is not canonical-eligible (always ``False``)."""
        return False
|