topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Enrichment job registry."""
|
|
2
|
+
|
|
3
|
+
from .base import BaseEnrichmentJob
|
|
4
|
+
from .canonical.embeddings_job import EmbeddingsJob
|
|
5
|
+
from .canonical.entities_job import EntitiesJob
|
|
6
|
+
from .canonical.emo_27_job import Emo27Job
|
|
7
|
+
from .canonical.sentiment_job import SentimentJob
|
|
8
|
+
from .canonical.topics_job import TopicsJob
|
|
9
|
+
from .raw.attachments_job import AttachmentsJob
|
|
10
|
+
from .raw.language_job import LanguageJob
|
|
11
|
+
from .raw.time_normalization_job import TimeNormalizationJob
|
|
12
|
+
from .raw.tool_calls_job import ToolCallsJob
|
|
13
|
+
|
|
14
|
+
# Shared canonical job instances, created once when this package is imported.
# The list order is the order in which the entries are instantiated.
CANONICAL_JOBS = [
    EntitiesJob(),
    TopicsJob(),
    SentimentJob(),
    EmbeddingsJob(),
    Emo27Job(),
]

# Shared raw-record job instances, likewise created once at import time.
RAW_JOBS = [
    AttachmentsJob(),
    ToolCallsJob(),
    LanguageJob(),
    TimeNormalizationJob(),
]

# Public names re-exported by this package.
__all__ = [
    # Base class
    "BaseEnrichmentJob",
    # Canonical job classes
    "EntitiesJob",
    "TopicsJob",
    "SentimentJob",
    "EmbeddingsJob",
    "Emo27Job",
    # Raw job classes
    "AttachmentsJob",
    "ToolCallsJob",
    "LanguageJob",
    "TimeNormalizationJob",
    # Pre-built job lists
    "CANONICAL_JOBS",
    "RAW_JOBS",
]
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from ...utils.base_object import BaseObject
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True)
class EnrichmentResult:
    """Immutable pairing of a result identifier with its string payload."""

    # Identifier of this enrichment result.
    result_id: str
    # String-to-string mapping that carries the result data.
    payload: Dict[str, str]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class EnrichmentJob:
    """Minimal job interface: turn an input reference into an EnrichmentResult."""

    def run(self, input_ref: str) -> EnrichmentResult:
        """Process ``input_ref``; concrete jobs are expected to override this.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BaseEnrichmentJob(BaseObject):
    """Common interface for enrichment jobs that operate on canonical messages.

    Subclasses supply ``get_job_name``, ``get_derived_table`` and ``enrich``;
    the versions defined here only raise ``NotImplementedError``.
    """

    def __init__(self, *, name: Optional[str] = None) -> None:
        """Create the job, forwarding the optional name to the base class.

        Args:
            name: Optional custom name. Per the original note it defaults to
                `ClassName#N` when omitted (behaviour lives in ``BaseObject``
                -- confirm there).
        """
        super().__init__(name=name)

    def get_job_name(self) -> str:
        """Return the job's identifier. Must be overridden."""
        raise NotImplementedError()

    def get_derived_table(self) -> str:
        """Return the name of the derived table for this job. Must be overridden."""
        raise NotImplementedError()

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Produce enrichment records for a batch of canonical messages.

        Args:
            canonical_messages: Canonical message dictionaries to enrich.
            progress_callback: Optional ``callback(current_count, total_count)``
                invoked while the batch is being processed.

        Returns:
            A list of enrichment result dictionaries.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

    def should_run(self, canonical_messages: List[Dict[str, Any]]) -> bool:
        """Return True only when the batch is non-empty (truthy)."""
        return True if canonical_messages else False
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Canonical enrichment jobs."""
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from ..base import BaseEnrichmentJob
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger("topos.enrichment.jobs.embeddings")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EmbeddingsJob(BaseEnrichmentJob):
    """Placeholder embeddings job: logs the batch size and produces no records."""

    def get_job_name(self) -> str:
        """Return the identifier of this job."""
        return "embeddings"

    def get_derived_table(self) -> str:
        """Return the derived table name associated with this job."""
        return "message_embeddings"

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Log the batch, report it as fully processed, and return no records.

        Args:
            canonical_messages: Canonical message dictionaries (only counted).
            progress_callback: Optional ``callback(current_count, total_count)``;
                when given it is called once with both values set to the
                batch size.

        Returns:
            An empty list.
        """
        message_count = len(canonical_messages)
        logger.debug("%s: Embeddings enrichment stub: %d messages", self, message_count)
        # The stub finishes immediately, so the whole batch is reported in one call.
        if progress_callback:
            progress_callback(message_count, message_count)
        return []
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Emotion classification enrichment via the Engine (HF or Ollama adapter)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from ..base import BaseEnrichmentJob
|
|
10
|
+
from ...progress_bar import ProgressBar
|
|
11
|
+
from ....engine import Engine
|
|
12
|
+
from ....engine.tasks import ModelRequest, ProcessingTask
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger("topos.enrichment.jobs.emo_27")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Emo27Job(BaseEnrichmentJob):
    """Emotion classification enrichment that delegates to the Engine.

    NOTE(review): every task is created with ``provider="huggingface"``; the
    module docstring also mentions Ollama -- confirm whether the Engine
    re-routes or whether the provider should become configurable.
    """

    def __init__(self, *, name: Optional[str] = None):
        """Create the job together with its own ``Engine`` instance.

        Args:
            name: Optional custom name forwarded to the base class.
        """
        super().__init__(name=name)
        self._engine = Engine()

    def get_derived_table(self) -> str:
        """Return the derived table name associated with this job."""
        return "message_emotions"

    def get_job_name(self) -> str:
        """Return the identifier of this job."""
        return "emo_27"

    async def _classify_message(
        self,
        message_id: Any,
        content: Any,
        source_id: Any,
    ) -> Optional[Dict[str, Any]]:
        """Run one emotion-classification task and build its result record.

        Returns:
            The record dictionary, or ``None`` when the Engine reports a
            status other than ``"completed"`` (a warning is logged).

        Raises:
            Exception: Whatever task construction, the Engine call, or reading
                the output raises; the caller handles it.
        """
        task = ProcessingTask(
            id=f"emo27_{message_id}",
            type="enrichment",
            subtype="emotion_classification",
            source_id=source_id,
            record_ids=[message_id],
            input={"text": content},
            model_request=ModelRequest(provider="huggingface"),
        )
        # Engine.run is called in a worker thread so the event loop is not blocked.
        result = await asyncio.to_thread(self._engine.run, task)
        if result.status != "completed":
            logger.warning(
                "[PIPELINE:ENRICHMENT] %s: Engine failed for message %s: %s",
                self, message_id, result.error or result.status,
            )
            return None
        out = result.output
        return {
            "message_id": message_id,
            "source_id": source_id,
            "emotion_label": out.get("emotion_label"),
            "confidence": out.get("confidence"),
            "all_emotions": out.get("all_emotions", []),
            "model": out.get("model", ""),
        }

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Enrich messages with emotion classifications via Engine.run(task).

        Messages without an id (``message_id`` or ``id``) or without content
        are skipped. A message whose classification fails is logged and
        skipped; it does not abort the batch.

        Args:
            canonical_messages: Canonical message dictionaries.
            progress_callback: Optional ``callback(current_count, total_count)``
                called exactly once per message, after that message has been
                handled (enriched, skipped, or failed).

        Returns:
            One record per successfully classified message, in input order.
        """
        logger.debug("[PIPELINE:ENRICHMENT] %s: processing %d messages", self, len(canonical_messages))
        results: List[Dict[str, Any]] = []
        total_messages = len(canonical_messages)

        with ProgressBar(total=total_messages, desc=str(self)) as pbar:
            for msg_idx, msg in enumerate(canonical_messages):
                # Yield to the event loop periodically so other tasks can run.
                if msg_idx % 10 == 0:
                    await asyncio.sleep(0)
                message_id = msg.get("message_id") or msg.get("id")
                content = msg.get("content", "")
                source_id = msg.get("source_id")

                if message_id and content:
                    try:
                        record = await self._classify_message(message_id, content, source_id)
                    except Exception as e:
                        logger.error(
                            "[PIPELINE:ENRICHMENT] %s: Failed to enrich message %s: %s",
                            self, message_id, e,
                        )
                    else:
                        if record is not None:
                            results.append(record)

                # Single reporting point, deliberately outside the try block: a
                # raising callback can no longer be mistaken for an enrichment
                # failure, and progress advances exactly once per message.
                pbar.update(1)
                if progress_callback:
                    progress_callback(msg_idx + 1, total_messages)

        logger.debug("[PIPELINE:ENRICHMENT] %s: created %d results", self, len(results))
        return results
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from ..base import BaseEnrichmentJob
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger("topos.enrichment.jobs.entities")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EntitiesJob(BaseEnrichmentJob):
    """Placeholder entities job: logs the batch size and produces no records."""

    def get_job_name(self) -> str:
        """Return the identifier of this job."""
        return "entities"

    def get_derived_table(self) -> str:
        """Return the derived table name associated with this job."""
        return "message_entities"

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Log the batch, report it as fully processed, and return no records.

        Args:
            canonical_messages: Canonical message dictionaries (only counted).
            progress_callback: Optional ``callback(current_count, total_count)``;
                when given it is called once with both values set to the
                batch size.

        Returns:
            An empty list.
        """
        message_count = len(canonical_messages)
        logger.debug("%s: Entities enrichment stub: %d messages", self, message_count)
        # The stub finishes immediately, so the whole batch is reported in one call.
        if progress_callback:
            progress_callback(message_count, message_count)
        return []
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from ..base import BaseEnrichmentJob
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger("topos.enrichment.jobs.sentiment")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SentimentJob(BaseEnrichmentJob):
    """Placeholder sentiment job: logs the batch size and produces no records."""

    def get_job_name(self) -> str:
        """Return the identifier of this job."""
        return "sentiment"

    def get_derived_table(self) -> str:
        """Return the derived table name associated with this job."""
        return "message_sentiment"

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Log the batch, report it as fully processed, and return no records.

        Args:
            canonical_messages: Canonical message dictionaries (only counted).
            progress_callback: Optional ``callback(current_count, total_count)``;
                when given it is called once with both values set to the
                batch size.

        Returns:
            An empty list.
        """
        message_count = len(canonical_messages)
        logger.debug("%s: Sentiment enrichment stub: %d messages", self, message_count)
        # The stub finishes immediately, so the whole batch is reported in one call.
        if progress_callback:
            progress_callback(message_count, message_count)
        return []
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from ..base import BaseEnrichmentJob
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger("topos.enrichment.jobs.topics")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TopicsJob(BaseEnrichmentJob):
    """Placeholder topics job: logs the batch size and produces no records."""

    def get_job_name(self) -> str:
        """Return the identifier of this job."""
        return "topics"

    def get_derived_table(self) -> str:
        """Return the derived table name associated with this job."""
        return "message_topics"

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Log the batch, report it as fully processed, and return no records.

        Args:
            canonical_messages: Canonical message dictionaries (only counted).
            progress_callback: Optional ``callback(current_count, total_count)``;
                when given it is called once with both values set to the
                batch size.

        Returns:
            An empty list.
        """
        message_count = len(canonical_messages)
        logger.debug("%s: Topics enrichment stub: %d messages", self, message_count)
        # The stub finishes immediately, so the whole batch is reported in one call.
        if progress_callback:
            progress_callback(message_count, message_count)
        return []
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Raw enrichment jobs."""
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class AttachmentsJob:
    """Placeholder raw job for attachments; currently yields no records."""

    async def run(self, raw_records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Accept a batch of raw records and return an empty result list.

        Args:
            raw_records: Raw record dictionaries; not inspected by this stub.

        Returns:
            An empty list.
        """
        del raw_records  # intentionally unused by the stub
        return []

    def get_job_name(self) -> str:
        """Return the identifier of this job."""
        return "attachments"
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class LanguageJob:
    """Placeholder raw job for language detection; currently yields no records."""

    async def run(self, raw_records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Accept a batch of raw records and return an empty result list.

        Args:
            raw_records: Raw record dictionaries; not inspected by this stub.

        Returns:
            An empty list.
        """
        del raw_records  # intentionally unused by the stub
        return []

    def get_job_name(self) -> str:
        """Return the identifier of this job."""
        return "language"
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TimeNormalizationJob:
    """Placeholder raw job for time normalization; currently yields no records."""

    async def run(self, raw_records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Accept a batch of raw records and return an empty result list.

        Args:
            raw_records: Raw record dictionaries; not inspected by this stub.

        Returns:
            An empty list.
        """
        del raw_records  # intentionally unused by the stub
        return []

    def get_job_name(self) -> str:
        """Return the identifier of this job."""
        return "time_normalization"
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ToolCallsJob:
    """Placeholder raw job for tool calls; currently yields no records."""

    async def run(self, raw_records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Accept a batch of raw records and return an empty result list.

        Args:
            raw_records: Raw record dictionaries; not inspected by this stub.

        Returns:
            An empty list.
        """
        del raw_records  # intentionally unused by the stub
        return []

    def get_job_name(self) -> str:
        """Return the identifier of this job."""
        return "tool_calls"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Enrichment model registry scaffolding."""
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Literal, Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ModelRegistry:
    """In-memory model registry. Supports HuggingFace and Ollama providers.

    Specs are kept in a dict keyed by ``model_id``; registering an id that
    already exists replaces the earlier spec.
    """

    def __init__(self):
        """Create an empty registry."""
        self._models: Dict[str, Dict[str, Any]] = {}

    def register_model(
        self,
        model_id: str,
        model_name: str,
        model_version: str,
        model_type: str,
        task_name: str,
        huggingface_path: str = "",
        is_preferred: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
        provider: Literal["ollama", "huggingface"] = "huggingface",
        ollama_model: Optional[str] = None,
    ) -> None:
        """Store (or overwrite) the spec for ``model_id``.

        Args:
            model_id: Registry key for the model.
            model_name: Stored under ``"model_name"``.
            model_version: Stored under ``"model_version"``.
            model_type: Stored under ``"model_type"``; matched by
                ``get_preferred_model``.
            task_name: Stored under ``"task_name"``; matched by the lookup
                methods.
            huggingface_path: Stored under ``"huggingface_path"``; a falsy
                value is normalised to ``""``.
            is_preferred: Marks the model as preferred for lookups.
            metadata: Optional extra data. A shallow copy is stored so that
                later changes to the caller's dict do not leak into the
                registry.
            provider: Either ``"ollama"`` or ``"huggingface"``.
            ollama_model: Optional value stored under ``"ollama_model"``.
        """
        self._models[model_id] = {
            "model_id": model_id,
            "model_name": model_name,
            "model_version": model_version,
            "model_type": model_type,
            "task_name": task_name,
            "huggingface_path": huggingface_path or "",
            "is_preferred": is_preferred,
            # Copy so the registry does not alias a dict the caller still owns.
            "metadata": dict(metadata) if metadata else {},
            "provider": provider,
            "ollama_model": ollama_model,
        }

    def get_model(self, model_id: str) -> Optional[Dict[str, Any]]:
        """Return the spec registered under ``model_id``, or ``None``."""
        return self._models.get(model_id)

    def list_models(self, task_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return registered specs in registration order.

        Args:
            task_name: When truthy, only specs whose ``task_name`` equals it
                are returned; otherwise every spec is returned.
        """
        models = list(self._models.values())
        if task_name:
            models = [m for m in models if m.get("task_name") == task_name]
        return models

    def get_preferred_model(self, task_name: str, model_type: str) -> Optional[Dict[str, Any]]:
        """Return the first preferred spec matching both task and type, else ``None``."""
        for model in self._models.values():
            if (
                model.get("task_name") == task_name
                and model.get("model_type") == model_type
                and model.get("is_preferred")
            ):
                return model
        return None

    def get_model_for_task(
        self,
        task_type: str,
        subtype: Optional[str] = None,
        source_id: Optional[str] = None,
    ) -> Optional[Dict[str, Any]]:
        """Return a model spec for a task (e.g. enrichment + url_classification).

        A spec is a candidate when its ``task_name`` equals ``task_type`` or,
        if ``subtype`` is given, equals ``subtype``. The first preferred
        candidate wins; otherwise the first candidate is returned.

        Args:
            task_type: Task type to match against ``task_name``.
            subtype: Optional subtype, also matched against ``task_name``.
            source_id: Accepted but currently not used in the selection.

        Returns:
            The full spec dictionary as stored by ``register_model``, or
            ``None`` when nothing matches.
        """
        candidates = [
            m for m in self._models.values()
            if m.get("task_name") == task_type or (subtype and m.get("task_name") == subtype)
        ]
        if not candidates:
            return None
        # Prefer the first spec flagged as preferred; fall back to the first match.
        return next((m for m in candidates if m.get("is_preferred")), candidates[0])
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from ..utils.base_object import BaseObject
|
|
7
|
+
from .derived_tables import DerivedTablesManager
|
|
8
|
+
from .jobs import CANONICAL_JOBS, RAW_JOBS
|
|
9
|
+
from .jobs.base import BaseEnrichmentJob
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger("topos.enrichment.orchestrator")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EnrichmentOrchestrator(BaseObject):
    """Runs the registered raw and canonical enrichment jobs and collects a summary."""

    def __init__(self, tables_manager: Optional[DerivedTablesManager] = None, *, name: Optional[str] = None):
        """Set up per-instance job lists and the derived-tables manager.

        Args:
            tables_manager: Manager used to write enrichment records; a new
                ``DerivedTablesManager`` is created when omitted (or falsy).
            name: Optional custom name forwarded to the base class.
        """
        super().__init__(name=name)
        # Copy the module-level lists so registrations stay local to this instance.
        self.raw_jobs = list(RAW_JOBS)
        self.canonical_jobs = list(CANONICAL_JOBS)
        self.tables_manager = tables_manager or DerivedTablesManager()

    def register_raw_job(self, job) -> None:
        """Append ``job`` to this orchestrator's raw jobs."""
        self.raw_jobs.append(job)

    def register_canonical_job(self, job: BaseEnrichmentJob) -> None:
        """Append ``job`` to this orchestrator's canonical jobs."""
        self.canonical_jobs.append(job)

    @staticmethod
    def _job_label(job: Any) -> str:
        """Return the job's name, or its class name when the lookup itself fails."""
        try:
            return job.get_job_name()
        except Exception:
            # Deliberate fallback: error reporting must not raise a second time.
            return type(job).__name__

    @staticmethod
    def _job_progress_reporter(
        progress_callback: Optional[Callable[[int, int, str, float, float], None]],
        job_name: str,
        jobs_percent: float,
    ) -> Callable[[int, int], None]:
        """Build the per-job ``callback(current_count, total_count)`` handed to a job.

        The returned function forwards to ``progress_callback`` (when one was
        supplied) using the same positional argument order as every other
        progress call in this class. Values are bound here rather than read
        from loop variables at call time.
        """
        def report(current_count: int, total_count: int) -> None:
            if not progress_callback:
                return
            job_progress = (current_count / total_count * 100) if total_count > 0 else 0.0
            # processed_count is reported as 0: it is not used for job-level tracking.
            progress_callback(0, total_count, job_name, jobs_percent, job_progress)

        return report

    async def run_raw(self, raw_records: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Run every raw job over ``raw_records``.

        A job that raises is recorded under ``"errors"`` and the remaining
        jobs still run.

        Returns:
            Dict with ``jobs_run``, ``records_created`` (record count per job
            name) and ``errors`` (``{"job", "error"}`` entries).
        """
        results: Dict[str, Any] = {"jobs_run": 0, "records_created": {}, "errors": []}
        for job in self.raw_jobs:
            try:
                records = await job.run(raw_records)
                results["records_created"][job.get_job_name()] = len(records)
                results["jobs_run"] += 1
            except Exception as exc:
                results["errors"].append({"job": self._job_label(job), "error": str(exc)})
        return results

    async def run_canonical(
        self,
        canonical_messages: List[Dict[str, Any]],
        job_names: Optional[List[str]] = None,
        progress_callback: Optional[Callable[[int, int, str, float, float], None]] = None,
    ) -> Dict[str, Any]:
        """Run canonical enrichment jobs.

        Args:
            canonical_messages: List of canonical message dictionaries
            job_names: Optional list of specific job names to run
            progress_callback: Optional callback, always called positionally as
                ``(processed_count, total_count, job_name, job_percent,
                current_job_progress)``: at the start of each job, on each
                job-level progress update (with ``processed_count`` 0), and
                once the job has completed or failed.

        Returns:
            Results dictionary with jobs_run, records_created, errors
        """
        results: Dict[str, Any] = {"jobs_run": 0, "records_created": {}, "errors": []}
        jobs_to_run = self.canonical_jobs
        if job_names:
            jobs_to_run = [job for job in self.canonical_jobs if job.get_job_name() in job_names]

        total_messages = len(canonical_messages)
        total_jobs = len(jobs_to_run)
        logger.info(
            "[PIPELINE:ENRICHMENT] %s: Starting enrichment: %d messages, %d jobs to run",
            self,
            total_messages,
            total_jobs,
        )

        # Cumulative message count across jobs, reported when each job finishes.
        messages_processed_so_far = 0

        for job_idx, job in enumerate(jobs_to_run, 1):
            if not job.should_run(canonical_messages):
                logger.debug(
                    "[PIPELINE:ENRICHMENT] %s: Skipping job %s (should_run=False)",
                    self,
                    self._job_label(job),
                )
                continue

            # total_jobs is at least 1 inside this loop, so the divisions are safe.
            percent_before = (job_idx - 1) / total_jobs * 100
            percent_after = job_idx / total_jobs * 100
            # Snapshot so that a failure after partial bookkeeping cannot count
            # this job's messages twice.
            processed_before = messages_processed_so_far

            try:
                job_name = job.get_job_name()
                logger.info(
                    "[PIPELINE:ENRICHMENT] %s: Running job %d/%d: %s (%d messages, %.1f%% of jobs complete)",
                    self,
                    job_idx,
                    total_jobs,
                    job_name,
                    total_messages,
                    percent_before,
                )

                # Call progress callback at start of job
                if progress_callback:
                    progress_callback(0, total_messages, job_name, percent_before, 0.0)

                records = await job.enrich(
                    canonical_messages,
                    progress_callback=self._job_progress_reporter(
                        progress_callback, job_name, percent_before
                    ),
                )

                # NOTE(review): a job that returns no records adds nothing to
                # the cumulative count, whereas a failed job (below) adds
                # total_messages -- confirm this asymmetry is intended.
                messages_processed_so_far = processed_before + (total_messages if records else 0)

                table_name = job.get_derived_table()
                if records:
                    # Write to derived table
                    records_written = self.tables_manager.write_enrichment_batch(
                        records, table_name
                    )
                    results["records_created"][table_name] = records_written
                    logger.info(
                        "[PIPELINE:ENRICHMENT] %s → %s: %d records written to %s (job %d/%d, %.1f%% complete)",
                        self,
                        job,
                        records_written,
                        table_name,
                        job_idx,
                        total_jobs,
                        percent_after,
                    )
                else:
                    results["records_created"][table_name] = 0
                    logger.info(
                        "[PIPELINE:ENRICHMENT] %s → %s: completed with 0 records (job %d/%d, %.1f%% complete)",
                        self,
                        job,
                        job_idx,
                        total_jobs,
                        percent_after,
                    )

                # Call progress callback after job completes (job is 100% done)
                if progress_callback:
                    progress_callback(
                        messages_processed_so_far, total_messages, job_name, percent_after, 100.0
                    )

                results["jobs_run"] += 1
            except Exception as exc:
                # Resolve the name defensively: the failure may have come from
                # get_job_name() itself.
                failed_name = self._job_label(job)
                logger.error(
                    "[PIPELINE:ENRICHMENT] %s → %s: failed: %s (job %d/%d)",
                    self,
                    job,
                    exc,
                    job_idx,
                    total_jobs,
                )
                results["errors"].append({"job": failed_name, "error": str(exc)})
                # Still count this job's messages as "processed" (even if failed)
                # for progress tracking -- exactly once, from the snapshot.
                messages_processed_so_far = processed_before + total_messages
                if progress_callback:
                    # Mark job as 100% complete (even if failed, we've moved past it)
                    progress_callback(
                        messages_processed_so_far, total_messages, failed_name, percent_after, 100.0
                    )

        logger.info(
            "[PIPELINE:ENRICHMENT] %s: Enrichment complete: %d jobs run, %d total records created",
            self,
            results["jobs_run"],
            sum(results["records_created"].values()),
        )
        return results
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
from .orchestrator import EnrichmentOrchestrator
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class EnrichmentProcessor:
    """Thin wrapper that runs canonical enrichment through an orchestrator."""

    def __init__(self, orchestrator: Optional[EnrichmentOrchestrator] = None):
        """Store the orchestrator to delegate to.

        Args:
            orchestrator: Orchestrator to use; a new ``EnrichmentOrchestrator``
                is created when this is omitted (or falsy).
        """
        if not orchestrator:
            orchestrator = EnrichmentOrchestrator()
        self.orchestrator = orchestrator

    async def process(
        self,
        canonical_messages: List[Dict[str, Any]],
        job_names: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Delegate to ``orchestrator.run_canonical`` and return its result.

        Args:
            canonical_messages: Canonical message dictionaries to enrich.
            job_names: Optional job-name list, forwarded as ``job_names``.

        Returns:
            Whatever the orchestrator's ``run_canonical`` returns.
        """
        outcome = await self.orchestrator.run_canonical(
            canonical_messages,
            job_names=job_names,
        )
        return outcome
|