topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249):
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,30 @@
1
+ """Enrichment job registry."""
2
+
3
+ from .base import BaseEnrichmentJob
4
+ from .canonical.embeddings_job import EmbeddingsJob
5
+ from .canonical.entities_job import EntitiesJob
6
+ from .canonical.emo_27_job import Emo27Job
7
+ from .canonical.sentiment_job import SentimentJob
8
+ from .canonical.topics_job import TopicsJob
9
+ from .raw.attachments_job import AttachmentsJob
10
+ from .raw.language_job import LanguageJob
11
+ from .raw.time_normalization_job import TimeNormalizationJob
12
+ from .raw.tool_calls_job import ToolCallsJob
13
+
14
# Default job instances. They are constructed once, when this package is
# imported, and the same objects are handed to every importer.
CANONICAL_JOBS = [
    EntitiesJob(),
    TopicsJob(),
    SentimentJob(),
    EmbeddingsJob(),
    Emo27Job(),
]
RAW_JOBS = [
    AttachmentsJob(),
    ToolCallsJob(),
    LanguageJob(),
    TimeNormalizationJob(),
]

# Public names of the package: the base class, each job class, then the
# two default registries.
__all__ = [
    "BaseEnrichmentJob",
    "EntitiesJob",
    "TopicsJob",
    "SentimentJob",
    "EmbeddingsJob",
    "Emo27Job",
    "AttachmentsJob",
    "ToolCallsJob",
    "LanguageJob",
    "TimeNormalizationJob",
    "CANONICAL_JOBS",
    "RAW_JOBS",
]
@@ -0,0 +1,54 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Callable, Dict, List, Optional
5
+
6
+ from ...utils.base_object import BaseObject
7
+
8
+
9
@dataclass(frozen=True)
class EnrichmentResult:
    """Immutable value object pairing a result identifier with its payload."""

    # Identifier of this result.
    result_id: str
    # String-to-string mapping carrying the result data.
    payload: Dict[str, str]
13
+
14
+
15
class EnrichmentJob:
    """Interface for a job that turns an input reference into an EnrichmentResult."""

    def run(self, input_ref: str) -> EnrichmentResult:
        """Process ``input_ref``; subclasses must override.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
18
+
19
+
20
class BaseEnrichmentJob(BaseObject):
    """Base enrichment job for canonical messages.

    Subclasses supply the job name, the derived table name, and the
    ``enrich`` coroutine; ``should_run`` has a default implementation.
    """

    def __init__(self, *, name: Optional[str] = None) -> None:
        """Create the job, forwarding ``name`` to ``BaseObject``.

        Args:
            name: Optional custom name. Defaults to `ClassName#N`
                (default chosen by BaseObject, which is not shown here).
        """
        super().__init__(name=name)

    def get_job_name(self) -> str:
        """Return the job's name; must be overridden."""
        raise NotImplementedError

    def get_derived_table(self) -> str:
        """Return the name of the derived table for this job; must be overridden."""
        raise NotImplementedError

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Enrich canonical messages; must be overridden.

        Args:
            canonical_messages: List of canonical message dictionaries.
            progress_callback: Optional ``callback(current_count, total_count)``
                to be called during processing.

        Returns:
            List of enrichment result dictionaries.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def should_run(self, canonical_messages: List[Dict[str, Any]]) -> bool:
        """Return True when there is at least one message to process."""
        return True if canonical_messages else False
@@ -0,0 +1 @@
1
+ """Canonical enrichment jobs."""
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import Any, Callable, Dict, List, Optional
5
+
6
+ from ..base import BaseEnrichmentJob
7
+
8
+ logger = logging.getLogger("topos.enrichment.jobs.embeddings")
9
+
10
+
11
class EmbeddingsJob(BaseEnrichmentJob):
    """Stub embeddings enrichment job: logs the call and produces no records."""

    def get_job_name(self) -> str:
        """Return the job name, ``"embeddings"``."""
        return "embeddings"

    def get_derived_table(self) -> str:
        """Return the derived table name, ``"message_embeddings"``."""
        return "message_embeddings"

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Log the message count, report completion, and return an empty list.

        Args:
            canonical_messages: List of canonical message dictionaries.
            progress_callback: Optional ``callback(current_count, total_count)``.

        Returns:
            Always ``[]``.
        """
        message_count = len(canonical_messages)
        logger.debug("%s: Embeddings enrichment stub: %d messages", self, message_count)
        # The stub finishes immediately, so report everything as done in one call.
        if progress_callback:
            progress_callback(message_count, message_count)
        return []
@@ -0,0 +1,97 @@
1
+ """Emotion classification enrichment via the Engine (HF or Ollama adapter)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ from typing import Any, Callable, Dict, List, Optional
8
+
9
+ from ..base import BaseEnrichmentJob
10
+ from ...progress_bar import ProgressBar
11
+ from ....engine import Engine
12
+ from ....engine.tasks import ModelRequest, ProcessingTask
13
+
14
+ logger = logging.getLogger("topos.enrichment.jobs.emo_27")
15
+
16
+
17
class Emo27Job(BaseEnrichmentJob):
    """Emotion classification enrichment using the Engine (HF or Ollama)."""

    def __init__(self, *, name: Optional[str] = None):
        """Create the job and its own Engine instance.

        Args:
            name: Optional custom name, forwarded to the base class.
        """
        super().__init__(name=name)
        self._engine = Engine()

    def get_derived_table(self) -> str:
        """Return the derived table name, ``"message_emotions"``."""
        return "message_emotions"

    def get_job_name(self) -> str:
        """Return the job name, ``"emo_27"``."""
        return "emo_27"

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Enrich messages with emotion classifications via Engine.run(task).

        Messages without an id (``message_id`` or ``id``) or without
        ``content`` are skipped. Messages whose classification fails are
        logged and skipped. Progress is reported exactly once per input
        message, whatever the outcome.

        Args:
            canonical_messages: List of canonical message dictionaries.
            progress_callback: Optional ``callback(current_count, total_count)``
                invoked after each message.

        Returns:
            One record per successfully classified message, with keys
            ``message_id``, ``source_id``, ``emotion_label``, ``confidence``,
            ``all_emotions`` and ``model``.
        """
        logger.debug("[PIPELINE:ENRICHMENT] %s: processing %d messages", self, len(canonical_messages))
        results: List[Dict[str, Any]] = []
        total_messages = len(canonical_messages)

        with ProgressBar(total=total_messages, desc=str(self)) as pbar:
            for msg_idx, msg in enumerate(canonical_messages):
                # Yield to the event loop every 10 messages.
                if msg_idx % 10 == 0:
                    await asyncio.sleep(0)

                message_id = msg.get("message_id") or msg.get("id")
                content = msg.get("content", "")
                source_id = msg.get("source_id")

                if message_id and content:
                    record = await self._classify_message(message_id, content, source_id)
                    if record is not None:
                        results.append(record)

                # Single bookkeeping point, deliberately outside any try block:
                # an error raised by the caller's callback must not be logged
                # as an enrichment failure, and the bar must advance only once.
                pbar.update(1)
                if progress_callback:
                    progress_callback(msg_idx + 1, total_messages)

        logger.debug("[PIPELINE:ENRICHMENT] %s: created %d results", self, len(results))
        return results

    async def _classify_message(
        self,
        message_id: Any,
        content: Any,
        source_id: Any,
    ) -> Optional[Dict[str, Any]]:
        """Classify one message; return its record, or None if it failed."""
        try:
            task = ProcessingTask(
                id=f"emo27_{message_id}",
                type="enrichment",
                subtype="emotion_classification",
                source_id=source_id,
                record_ids=[message_id],
                input={"text": content},
                model_request=ModelRequest(provider="huggingface"),
            )
            # Engine.run is called synchronously, so run it in a worker thread.
            result = await asyncio.to_thread(self._engine.run, task)
            if result.status != "completed":
                logger.warning(
                    "[PIPELINE:ENRICHMENT] %s: Engine failed for message %s: %s",
                    self, message_id, result.error or result.status,
                )
                return None
            out = result.output
            return {
                "message_id": message_id,
                "source_id": source_id,
                "emotion_label": out.get("emotion_label"),
                "confidence": out.get("confidence"),
                "all_emotions": out.get("all_emotions", []),
                "model": out.get("model", ""),
            }
        except Exception as e:
            # Best effort: one bad message must not abort the whole batch.
            logger.error(
                "[PIPELINE:ENRICHMENT] %s: Failed to enrich message %s: %s",
                self, message_id, e,
            )
            return None
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import Any, Callable, Dict, List, Optional
5
+
6
+ from ..base import BaseEnrichmentJob
7
+
8
+ logger = logging.getLogger("topos.enrichment.jobs.entities")
9
+
10
+
11
class EntitiesJob(BaseEnrichmentJob):
    """Stub entities enrichment job: logs the call and produces no records."""

    def get_job_name(self) -> str:
        """Return the job name, ``"entities"``."""
        return "entities"

    def get_derived_table(self) -> str:
        """Return the derived table name, ``"message_entities"``."""
        return "message_entities"

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Log the message count, report completion, and return an empty list.

        Args:
            canonical_messages: List of canonical message dictionaries.
            progress_callback: Optional ``callback(current_count, total_count)``.

        Returns:
            Always ``[]``.
        """
        message_count = len(canonical_messages)
        logger.debug("%s: Entities enrichment stub: %d messages", self, message_count)
        # The stub finishes immediately, so report everything as done in one call.
        if progress_callback:
            progress_callback(message_count, message_count)
        return []
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import Any, Callable, Dict, List, Optional
5
+
6
+ from ..base import BaseEnrichmentJob
7
+
8
+ logger = logging.getLogger("topos.enrichment.jobs.sentiment")
9
+
10
+
11
class SentimentJob(BaseEnrichmentJob):
    """Stub sentiment enrichment job: logs the call and produces no records."""

    def get_job_name(self) -> str:
        """Return the job name, ``"sentiment"``."""
        return "sentiment"

    def get_derived_table(self) -> str:
        """Return the derived table name, ``"message_sentiment"``."""
        return "message_sentiment"

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Log the message count, report completion, and return an empty list.

        Args:
            canonical_messages: List of canonical message dictionaries.
            progress_callback: Optional ``callback(current_count, total_count)``.

        Returns:
            Always ``[]``.
        """
        message_count = len(canonical_messages)
        logger.debug("%s: Sentiment enrichment stub: %d messages", self, message_count)
        # The stub finishes immediately, so report everything as done in one call.
        if progress_callback:
            progress_callback(message_count, message_count)
        return []
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import Any, Callable, Dict, List, Optional
5
+
6
+ from ..base import BaseEnrichmentJob
7
+
8
+ logger = logging.getLogger("topos.enrichment.jobs.topics")
9
+
10
+
11
class TopicsJob(BaseEnrichmentJob):
    """Stub topics enrichment job: logs the call and produces no records."""

    def get_job_name(self) -> str:
        """Return the job name, ``"topics"``."""
        return "topics"

    def get_derived_table(self) -> str:
        """Return the derived table name, ``"message_topics"``."""
        return "message_topics"

    async def enrich(
        self,
        canonical_messages: List[Dict[str, Any]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict[str, Any]]:
        """Log the message count, report completion, and return an empty list.

        Args:
            canonical_messages: List of canonical message dictionaries.
            progress_callback: Optional ``callback(current_count, total_count)``.

        Returns:
            Always ``[]``.
        """
        message_count = len(canonical_messages)
        logger.debug("%s: Topics enrichment stub: %d messages", self, message_count)
        # The stub finishes immediately, so report everything as done in one call.
        if progress_callback:
            progress_callback(message_count, message_count)
        return []
@@ -0,0 +1 @@
1
+ """Raw enrichment jobs."""
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List
4
+
5
+
6
class AttachmentsJob:
    """Raw-record job named ``"attachments"``; currently a no-op."""

    async def run(self, raw_records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Accept the raw records and return an empty list of results."""
        del raw_records  # intentionally unused
        return []

    def get_job_name(self) -> str:
        """Return the job name, ``"attachments"``."""
        return "attachments"
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List
4
+
5
+
6
class LanguageJob:
    """Raw-record job named ``"language"``; currently a no-op."""

    async def run(self, raw_records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Accept the raw records and return an empty list of results."""
        del raw_records  # intentionally unused
        return []

    def get_job_name(self) -> str:
        """Return the job name, ``"language"``."""
        return "language"
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List
4
+
5
+
6
class TimeNormalizationJob:
    """Raw-record job named ``"time_normalization"``; currently a no-op."""

    async def run(self, raw_records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Accept the raw records and return an empty list of results."""
        del raw_records  # intentionally unused
        return []

    def get_job_name(self) -> str:
        """Return the job name, ``"time_normalization"``."""
        return "time_normalization"
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List
4
+
5
+
6
class ToolCallsJob:
    """Raw-record job named ``"tool_calls"``; currently a no-op."""

    async def run(self, raw_records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Accept the raw records and return an empty list of results."""
        del raw_records  # intentionally unused
        return []

    def get_job_name(self) -> str:
        """Return the job name, ``"tool_calls"``."""
        return "tool_calls"
@@ -0,0 +1 @@
1
+ """Enrichment model registry scaffolding."""
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+
3
+
4
class ModelManager:
    """Placeholder for model loading/management."""

    def __init__(self):
        """Start with an empty ``_loaded`` mapping."""
        self._loaded = dict()
@@ -0,0 +1,71 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Literal, Optional
4
+
5
+
6
class ModelRegistry:
    """In-memory model registry. Supports HuggingFace and Ollama providers."""

    def __init__(self):
        """Start with no registered models."""
        # model_id -> model spec dict, in registration order
        self._models: Dict[str, Dict[str, Any]] = {}

    def register_model(
        self,
        model_id: str,
        model_name: str,
        model_version: str,
        model_type: str,
        task_name: str,
        huggingface_path: str = "",
        is_preferred: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
        provider: Literal["ollama", "huggingface"] = "huggingface",
        ollama_model: Optional[str] = None,
    ) -> None:
        """Store a model spec under ``model_id``, replacing any existing entry.

        A falsy ``huggingface_path`` is stored as ``""`` and a falsy
        ``metadata`` as ``{}``; the other arguments are stored as given.
        """
        spec = dict(
            model_id=model_id,
            model_name=model_name,
            model_version=model_version,
            model_type=model_type,
            task_name=task_name,
            huggingface_path=huggingface_path or "",
            is_preferred=is_preferred,
            metadata=metadata or {},
            provider=provider,
            ollama_model=ollama_model,
        )
        self._models[model_id] = spec

    def get_model(self, model_id: str) -> Optional[Dict[str, Any]]:
        """Return the spec registered under ``model_id``, or None."""
        return self._models.get(model_id)

    def list_models(self, task_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return all specs, or only those for ``task_name`` when it is truthy."""
        if not task_name:
            return list(self._models.values())
        return [spec for spec in self._models.values() if spec.get("task_name") == task_name]

    def get_preferred_model(self, task_name: str, model_type: str) -> Optional[Dict[str, Any]]:
        """Return the first preferred spec matching both task and type, or None."""
        return next(
            (
                spec
                for spec in self._models.values()
                if spec.get("task_name") == task_name
                and spec.get("model_type") == model_type
                and spec.get("is_preferred")
            ),
            None,
        )

    def get_model_for_task(
        self,
        task_type: str,
        subtype: Optional[str] = None,
        source_id: Optional[str] = None,
    ) -> Optional[Dict[str, Any]]:
        """Return the model spec for a task (e.g. enrichment + url_classification).

        A spec is a candidate when its ``task_name`` equals ``task_type`` or,
        if ``subtype`` is truthy, equals ``subtype``. The first preferred
        candidate wins; otherwise the first candidate in registration order.
        ``source_id`` is accepted but not used.

        Returns:
            The registered spec dict, or None when nothing matches.
        """
        accepted = (task_type, subtype) if subtype else (task_type,)
        matches = [spec for spec in self._models.values() if spec.get("task_name") in accepted]
        if not matches:
            return None
        return next((spec for spec in matches if spec.get("is_preferred")), matches[0])
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+
3
+
4
class ModelVersioning:
    """Placeholder for model versioning helpers."""

    def __init__(self):
        """Start with an empty ``_versions`` mapping."""
        self._versions = dict()
@@ -0,0 +1,177 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import Any, Callable, Dict, List, Optional
5
+
6
+ from ..utils.base_object import BaseObject
7
+ from .derived_tables import DerivedTablesManager
8
+ from .jobs import CANONICAL_JOBS, RAW_JOBS
9
+ from .jobs.base import BaseEnrichmentJob
10
+
11
+ logger = logging.getLogger("topos.enrichment.orchestrator")
12
+
13
+
14
class EnrichmentOrchestrator(BaseObject):
    """Run the registered raw and canonical enrichment jobs and summarize the outcome."""

    def __init__(self, tables_manager: Optional[DerivedTablesManager] = None, *, name: Optional[str] = None):
        """Set up the job lists and the derived-tables writer.

        Args:
            tables_manager: Writer for enrichment records; a new
                DerivedTablesManager is created when omitted (or falsy).
            name: Optional custom name, forwarded to BaseObject.
        """
        super().__init__(name=name)
        # Copy the module-level lists so registering a job on this instance
        # does not grow the shared defaults (the job objects are still shared).
        self.raw_jobs = list(RAW_JOBS)
        self.canonical_jobs = list(CANONICAL_JOBS)
        self.tables_manager = tables_manager or DerivedTablesManager()

    def register_raw_job(self, job) -> None:
        """Append ``job`` to this orchestrator's raw jobs."""
        self.raw_jobs.append(job)

    def register_canonical_job(self, job: BaseEnrichmentJob) -> None:
        """Append ``job`` to this orchestrator's canonical jobs."""
        self.canonical_jobs.append(job)

    async def run_raw(self, raw_records: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Run every raw job over ``raw_records``.

        A job that raises is recorded under ``errors`` and the remaining
        jobs still run.

        Returns:
            Dict with ``jobs_run`` (count of jobs that finished),
            ``records_created`` (job name -> number of records returned) and
            ``errors`` (list of ``{"job", "error"}`` dicts).
        """
        results = {"jobs_run": 0, "records_created": {}, "errors": []}
        for job in self.raw_jobs:
            try:
                records = await job.run(raw_records)
                results["records_created"][job.get_job_name()] = len(records)
                results["jobs_run"] += 1
            except Exception as exc:
                results["errors"].append({"job": job.get_job_name(), "error": str(exc)})
        return results

    async def run_canonical(
        self,
        canonical_messages: List[Dict[str, Any]],
        job_names: Optional[List[str]] = None,
        progress_callback: Optional[Callable[[int, int, str, float, float], None]] = None,
    ) -> Dict[str, Any]:
        """Run canonical enrichment jobs.

        Jobs run one after another. A job whose ``should_run`` returns False
        is skipped. A job that raises is recorded under ``errors`` and the
        remaining jobs still run.

        Args:
            canonical_messages: List of canonical message dictionaries
            job_names: Optional list of specific job names to run
            progress_callback: Optional callback, always invoked positionally as
                ``(processed_count, total_count, job_name, job_percent,
                current_job_progress)``. It is called when a job starts, on
                every progress update the job reports, and when the job
                finishes or fails.

        Returns:
            Results dictionary with jobs_run, records_created (derived table
            name -> records written), errors
        """
        results = {"jobs_run": 0, "records_created": {}, "errors": []}
        jobs_to_run = self.canonical_jobs
        if job_names:
            jobs_to_run = [job for job in self.canonical_jobs if job.get_job_name() in job_names]

        total_messages = len(canonical_messages)
        total_jobs = len(jobs_to_run)
        logger.info(
            "[PIPELINE:ENRICHMENT] %s: Starting enrichment: %d messages, %d jobs to run",
            self,
            total_messages,
            total_jobs,
        )

        # Cumulative message count reported to the callback. Every job sees
        # the full message list, so this can exceed total_messages.
        messages_processed_so_far = 0

        for job_idx, job in enumerate(jobs_to_run, 1):
            if not job.should_run(canonical_messages):
                logger.debug("[PIPELINE:ENRICHMENT] %s: Skipping job %s (should_run=False)", self, job.get_job_name())
                continue
            try:
                job_name = job.get_job_name()
                logger.info(
                    "[PIPELINE:ENRICHMENT] %s: Running job %d/%d: %s (%d messages, %.1f%% of jobs complete)",
                    self,
                    job_idx,
                    total_jobs,
                    job_name,
                    total_messages,
                    ((job_idx - 1) / total_jobs * 100) if total_jobs > 0 else 0,
                )

                # Adapter from the job's two-argument callback to the caller's
                # five-argument one.
                def job_progress_callback(current_count: int, total_count: int):
                    """Callback for job-level progress updates."""
                    if progress_callback:
                        # Progress inside the current job, as a percentage.
                        job_progress = (current_count / total_count * 100) if total_count > 0 else 0.0
                        # Share of jobs already finished: (job_idx - 1) previous jobs are done.
                        jobs_percent = ((job_idx - 1) / total_jobs * 100) if total_jobs > 0 else 0
                        # Positional call, like every other call site: the
                        # Callable type promises nothing about parameter
                        # names, so keywords would break other callbacks.
                        progress_callback(
                            0,  # processed_count is not used for job-level tracking
                            total_count,
                            job_name,
                            jobs_percent,
                            job_progress,
                        )

                # Announce the start of the job.
                if progress_callback:
                    jobs_percent = ((job_idx - 1) / total_jobs * 100) if total_jobs > 0 else 0
                    progress_callback(0, total_messages, job_name, jobs_percent, 0.0)

                records = await job.enrich(canonical_messages, progress_callback=job_progress_callback)

                # Only a job that produced records adds its messages to the
                # cumulative count; a job that returned nothing adds zero.
                messages_processed_this_job = total_messages if records else 0
                messages_processed_so_far += messages_processed_this_job

                if records:
                    # Write to derived table
                    table_name = job.get_derived_table()
                    records_written = self.tables_manager.write_enrichment_batch(
                        records, table_name
                    )
                    results["records_created"][table_name] = records_written
                    logger.info(
                        "[PIPELINE:ENRICHMENT] %s → %s: %d records written to %s (job %d/%d, %.1f%% complete)",
                        self,
                        job,
                        records_written,
                        table_name,
                        job_idx,
                        total_jobs,
                        (job_idx / total_jobs * 100) if total_jobs > 0 else 100,
                    )
                else:
                    results["records_created"][job.get_derived_table()] = 0
                    logger.info(
                        "[PIPELINE:ENRICHMENT] %s → %s: completed with 0 records (job %d/%d, %.1f%% complete)",
                        self,
                        job,
                        job_idx,
                        total_jobs,
                        (job_idx / total_jobs * 100) if total_jobs > 0 else 100,
                    )

                # Announce the end of the job: cumulative count so far, job at 100%.
                if progress_callback:
                    job_progress_percent = (job_idx / total_jobs * 100) if total_jobs > 0 else 100
                    progress_callback(messages_processed_so_far, total_messages, job_name, job_progress_percent, 100.0)

                results["jobs_run"] += 1
            except Exception as exc:
                logger.error(
                    "[PIPELINE:ENRICHMENT] %s → %s: failed: %s (job %d/%d)",
                    self,
                    job,
                    exc,
                    job_idx,
                    total_jobs,
                )
                results["errors"].append({"job": job.get_job_name(), "error": str(exc)})
                # Still count this job's messages as "processed" (even if failed) for progress tracking
                messages_processed_so_far += total_messages
                if progress_callback:
                    job_progress_percent = (job_idx / total_jobs * 100) if total_jobs > 0 else 100
                    # Mark job as 100% complete (even if failed, we've moved past it)
                    progress_callback(messages_processed_so_far, total_messages, job.get_job_name(), job_progress_percent, 100.0)

        logger.info(
            "[PIPELINE:ENRICHMENT] %s: Enrichment complete: %d jobs run, %d total records created",
            self,
            results["jobs_run"],
            sum(results["records_created"].values()),
        )
        return results
@@ -0,0 +1,17 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from .orchestrator import EnrichmentOrchestrator
6
+
7
+
8
class EnrichmentProcessor:
    """Thin front end that delegates canonical enrichment to an orchestrator."""

    def __init__(self, orchestrator: Optional[EnrichmentOrchestrator] = None):
        """Use the given orchestrator, or create a default one when it is omitted (or falsy)."""
        self.orchestrator = orchestrator if orchestrator else EnrichmentOrchestrator()

    async def process(
        self,
        canonical_messages: List[Dict[str, Any]],
        job_names: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Run canonical enrichment and return the orchestrator's result dict.

        Args:
            canonical_messages: List of canonical message dictionaries.
            job_names: Optional list of job names, passed through unchanged.
        """
        summary = await self.orchestrator.run_canonical(canonical_messages, job_names=job_names)
        return summary