topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,45 @@
1
+ """Coarse table layer labels for list_database_tables (system / raw / enrichment / canonical)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, Tuple
6
+
7
+ TableLayerKind = str # "system" | "raw" | "enrichment" | "canonical"
8
+
9
+ _LAYER_LABELS: Dict[TableLayerKind, str] = {
10
+ "system": "Topos system",
11
+ "raw": "Raw",
12
+ "enrichment": "Enrichment",
13
+ "canonical": "Canonical",
14
+ }
15
+
16
+ # Maps engine list_database_tables category keys to coarse layer kinds.
17
+ _CATEGORY_TO_LAYER: Dict[str, TableLayerKind] = {
18
+ "system": "system",
19
+ "enrichment_system": "system",
20
+ "raw_retention": "raw",
21
+ "raw_enrichment": "enrichment",
22
+ "browser_flat": "raw",
23
+ "source": "raw",
24
+ "canonical": "canonical",
25
+ "canonical_enrichment": "enrichment",
26
+ "other": "raw",
27
+ }
28
+
29
+
30
def layer_kind_for_category(category_key: str) -> TableLayerKind:
    """Map a table category key to its coarse layer kind.

    The key is whitespace-stripped before the lookup. A falsy key, or one
    that is absent from ``_CATEGORY_TO_LAYER``, resolves to ``"raw"``.
    """
    normalized = category_key.strip() if category_key else ""
    try:
        return _CATEGORY_TO_LAYER[normalized]
    except KeyError:
        # Unknown categories are bucketed with raw tables.
        return "raw"
33
+
34
+
35
def layer_label_for_kind(layer_kind: str) -> str:
    """Return the display label for a layer kind.

    The kind is whitespace-stripped first; a falsy or unrecognised kind
    yields the label ``"Raw"``.
    """
    normalized = layer_kind.strip() if layer_kind else ""
    if normalized in _LAYER_LABELS:
        return _LAYER_LABELS[normalized]
    return "Raw"
37
+
38
+
39
def layer_for_category(category_key: str) -> Tuple[TableLayerKind, str]:
    """Resolve a category key to a ``(layer kind, display label)`` pair."""
    resolved_kind = layer_kind_for_category(category_key)
    resolved_label = layer_label_for_kind(resolved_kind)
    return (resolved_kind, resolved_label)
42
+
43
+
44
def layer_kind_labels() -> Dict[TableLayerKind, str]:
    """Return a fresh copy of the layer-kind -> label mapping.

    A copy is handed out so callers cannot mutate the module-level table.
    """
    return {kind: label for kind, label in _LAYER_LABELS.items()}
topos/core/types.py ADDED
@@ -0,0 +1,13 @@
1
+ """Shared types for Topos."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, TypedDict
6
+
7
+ JsonDict = Dict[str, Any]
8
+
9
+
10
class HealthStatus(TypedDict):
    """Typed shape of a health-status dictionary."""

    # Status string (the set of allowed values is not defined in this module).
    status: str
    # Numeric time value; presumably an epoch timestamp -- confirm with producers.
    time: float
    # Cloud connectivity flag; None is an allowed value.
    cloud_connected: bool | None
@@ -0,0 +1,150 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import sqlite3
6
+ from typing import Any, Dict, Optional
7
+
8
+ from topos.core.state import get_engine_config_value, set_engine_config_value
9
+
10
+ logger = logging.getLogger("topos.data_explorer_table_prefs")
11
+
12
+ PREFS_VERSION = 1
13
+ MAX_TABLE_NAME_LEN = 256
14
+ MAX_USER_ID_LEN = 256
15
+ MAX_PREFS_BYTES = 32 * 1024
16
+ MIN_COLUMN_WIDTH = 80
17
+ MAX_COLUMN_WIDTH = 480
18
+
19
+
20
def build_table_prefs_config_key(user_id: str, table_name: str) -> str:
    """Build the config key under which a user's prefs for one table are stored.

    Both parts are stringified and whitespace-stripped; falsy values become
    empty strings. No validation happens here.
    """
    uid, name = (str(part or "").strip() for part in (user_id, table_name))
    return f"data_explorer_table_prefs:v1:{uid}:{name}"
24
+
25
+
26
def _clamp_column_width(width: Any, fallback: int = 160) -> int:
    """Coerce *width* to an int within [MIN_COLUMN_WIDTH, MAX_COLUMN_WIDTH].

    Values that cannot be converted to a float, NaN, and negative numbers
    are replaced by *fallback* before clamping.

    Args:
        width: Candidate width, typically taken from a decoded JSON payload.
        fallback: Width used when *width* is unusable.

    Returns:
        The clamped width, truncated to an int.
    """
    try:
        n = float(width)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float() rejects ints too large for a double, which
        # JSON decoding can produce from a long integer literal. Treat it
        # like any other unusable value instead of letting it escape.
        n = float(fallback)
    # NaN is the only float that is not equal to itself.
    if n != n or n < 0:
        n = float(fallback)
    return int(min(MAX_COLUMN_WIDTH, max(MIN_COLUMN_WIDTH, n)))
34
+
35
+
36
def normalize_table_prefs_payload(raw: Any) -> Dict[str, Any]:
    """Validate a table-prefs payload and return its normalized envelope.

    The envelope always holds ``version`` and ``columnWidths``. ``sort``,
    ``hiddenColumns`` and ``columnOrder`` are included only when supplied
    (and, for the two lists, non-empty after cleaning).

    Raises:
        ValueError: ``"INVALID_PREFS"`` when the payload, ``columnWidths`` or
            a list field has the wrong type; ``"INVALID_SORT"`` when ``sort``
            is malformed; ``"PREFS_TOO_LARGE"`` when the compact JSON
            encoding exceeds ``MAX_PREFS_BYTES``.
    """
    if not isinstance(raw, dict):
        raise ValueError("INVALID_PREFS")
    widths_in = raw.get("columnWidths")
    if not isinstance(widths_in, dict):
        raise ValueError("INVALID_PREFS")

    # Column widths: blank column names are dropped, widths are clamped.
    widths_out: Dict[str, int] = {}
    for raw_key, raw_width in widths_in.items():
        column = str(raw_key).strip()
        if column:
            widths_out[column] = _clamp_column_width(raw_width)
    prefs: Dict[str, Any] = {"columnWidths": widths_out}

    # Optional sort descriptor.
    sort_in = raw.get("sort")
    if sort_in is not None:
        if not isinstance(sort_in, dict):
            raise ValueError("INVALID_SORT")
        column_id = str(sort_in.get("columnId") or "").strip()
        direction = str(sort_in.get("direction") or "").strip().lower()
        if not (column_id and direction in {"asc", "desc"}):
            raise ValueError("INVALID_SORT")
        prefs["sort"] = {"columnId": column_id, "direction": direction}

    # Optional string lists; entries that are blank after stripping are dropped.
    for list_field in ("hiddenColumns", "columnOrder"):
        items = raw.get(list_field)
        if items is None:
            continue
        if not isinstance(items, list):
            raise ValueError("INVALID_PREFS")
        kept = [text for text in (str(item).strip() for item in items) if text]
        if kept:
            prefs[list_field] = kept

    envelope = {"version": PREFS_VERSION, **prefs}
    # The size limit applies to the compact UTF-8 JSON encoding.
    encoded = json.dumps(envelope, separators=(",", ":"), default=str).encode("utf-8")
    if len(encoded) > MAX_PREFS_BYTES:
        raise ValueError("PREFS_TOO_LARGE")
    return envelope
72
+
73
+
74
def _validate_table_name(table_name: str) -> str:
    """Return the stripped table name.

    Raises:
        ValueError: ``"INVALID_TABLE_NAME"`` when the name is empty after
            stripping or longer than ``MAX_TABLE_NAME_LEN``.
    """
    cleaned = str(table_name or "").strip()
    if cleaned and len(cleaned) <= MAX_TABLE_NAME_LEN:
        return cleaned
    raise ValueError("INVALID_TABLE_NAME")
79
+
80
+
81
def _validate_user_id(user_id: str) -> str:
    """Return the stripped user id.

    Raises:
        ValueError: ``"INVALID_USER_ID"`` when the id is empty after
            stripping or longer than ``MAX_USER_ID_LEN``.
    """
    cleaned = str(user_id or "").strip()
    if cleaned and len(cleaned) <= MAX_USER_ID_LEN:
        return cleaned
    raise ValueError("INVALID_USER_ID")
86
+
87
+
88
def get_table_prefs(
    conn: sqlite3.Connection,
    *,
    user_id: str,
    table_name: str,
) -> Optional[Dict[str, Any]]:
    """Load and re-normalize the stored prefs for one user and table.

    Returns:
        The normalized envelope, or ``None`` when nothing is stored or the
        stored value is not valid JSON, not a JSON object, or fails
        normalization.

    Raises:
        ValueError: When *user_id* or *table_name* fails validation.
    """
    uid = _validate_user_id(user_id)
    name = _validate_table_name(table_name)
    config_key = build_table_prefs_config_key(uid, name)

    stored = get_engine_config_value(conn, config_key)
    if not stored:
        return None

    try:
        decoded = json.loads(stored)
    except json.JSONDecodeError:
        return None

    result: Optional[Dict[str, Any]] = None
    if isinstance(decoded, dict):
        try:
            result = normalize_table_prefs_payload(decoded)
        except ValueError:
            # A stored payload that no longer validates reads as "no prefs".
            result = None
    return result
109
+
110
+
111
def put_table_prefs(
    conn: sqlite3.Connection,
    *,
    user_id: str,
    table_name: str,
    prefs: Dict[str, Any],
) -> Dict[str, Any]:
    """Validate, normalize and store prefs for one user and table.

    Returns:
        The normalized envelope that was written.

    Raises:
        ValueError: When the ids or the payload fail validation.
    """
    uid = _validate_user_id(user_id)
    name = _validate_table_name(table_name)
    normalized = normalize_table_prefs_payload(prefs)

    config_key = build_table_prefs_config_key(uid, name)
    serialized = json.dumps(normalized)
    set_engine_config_value(conn, config_key, serialized)

    column_count = len(normalized.get("columnWidths") or {})
    # Only the first 8 characters of the user id are logged.
    logger.info(
        "data_explorer_table_prefs_saved user_id=%s table_name=%s columns=%d",
        uid[:8],
        name,
        column_count,
    )
    return normalized
129
+
130
+
131
def delete_table_prefs(
    conn: sqlite3.Connection,
    *,
    user_id: str,
    table_name: str,
) -> bool:
    """Delete the stored prefs for one user and table.

    Returns:
        ``True`` when a stored value existed and its row was deleted and
        committed; ``False`` when nothing was stored.

    Raises:
        ValueError: When *user_id* or *table_name* fails validation.
    """
    uid = _validate_user_id(user_id)
    name = _validate_table_name(table_name)
    config_key = build_table_prefs_config_key(uid, name)

    if get_engine_config_value(conn, config_key):
        conn.execute("DELETE FROM engine_config WHERE key = ?", (config_key,))
        conn.commit()
        # Only the first 8 characters of the user id are logged.
        logger.info(
            "data_explorer_table_prefs_deleted user_id=%s table_name=%s",
            uid[:8],
            name,
        )
        return True
    return False
@@ -0,0 +1,29 @@
1
+ """Topos Engine: unified runtime for ML/LLM processing (enrichments, transformations, queries)."""
2
+
3
+ from .engine import Engine
4
+ from .tasks import (
5
+ ExecutionMeta,
6
+ ExecutionSpec,
7
+ ModelRequest,
8
+ ProcessingResult,
9
+ ProcessingTask,
10
+ Provenance,
11
+ RequestedBy,
12
+ TaskOptions,
13
+ build_task,
14
+ build_url_classification_task,
15
+ )
16
+
17
+ __all__ = [
18
+ "Engine",
19
+ "ProcessingTask",
20
+ "ProcessingResult",
21
+ "ModelRequest",
22
+ "ExecutionSpec",
23
+ "TaskOptions",
24
+ "RequestedBy",
25
+ "Provenance",
26
+ "ExecutionMeta",
27
+ "build_task",
28
+ "build_url_classification_task",
29
+ ]
@@ -0,0 +1,50 @@
1
+ """Backend adapters for model inference."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import threading
6
+ from typing import Any, Dict, Optional
7
+
8
+ from .base import BackendAdapter
9
+ from .huggingface import HuggingFaceAdapter
10
+ from .ollama import OllamaAdapter
11
+ from .stub import StubBackendAdapter, get_stub_adapter
12
+
13
+ _huggingface_singleton: HuggingFaceAdapter | None = None
14
+ _ollama_singleton: OllamaAdapter | None = None
15
+ _huggingface_lock = threading.Lock()
16
+ _ollama_lock = threading.Lock()
17
+
18
+ __all__ = [
19
+ "BackendAdapter",
20
+ "HuggingFaceAdapter",
21
+ "OllamaAdapter",
22
+ "StubBackendAdapter",
23
+ "get_stub_adapter",
24
+ "get_huggingface_adapter",
25
+ "get_ollama_adapter",
26
+ ]
27
+
28
+
29
def get_huggingface_adapter() -> HuggingFaceAdapter:
    """Return the process-wide HuggingFace adapter, creating it on first call.

    The singleton is checked once without the lock and again while holding
    ``_huggingface_lock``, so at most one adapter is constructed.
    """
    global _huggingface_singleton
    adapter = _huggingface_singleton
    if adapter is None:
        with _huggingface_lock:
            adapter = _huggingface_singleton
            if adapter is None:
                adapter = HuggingFaceAdapter()
                _huggingface_singleton = adapter
    return adapter
38
+
39
+
40
def get_ollama_adapter() -> OllamaAdapter:
    """Return the process-wide Ollama adapter, creating it on first call.

    The adapter is built with no explicit base URL. The singleton is checked
    once without the lock and again while holding ``_ollama_lock``.
    """
    global _ollama_singleton
    adapter = _ollama_singleton
    if adapter is None:
        with _ollama_lock:
            adapter = _ollama_singleton
            if adapter is None:
                adapter = OllamaAdapter()
                _ollama_singleton = adapter
    return adapter
49
+
50
+
@@ -0,0 +1,21 @@
1
+ """Backend adapter protocol for model inference (PRD §8.2)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, Optional, Protocol
6
+
7
+
8
class BackendAdapter(Protocol):
    """Structural interface that every model backend adapter implements."""

    def load_model(self, model_name: str, config: Optional[Dict[str, Any]] = None) -> None:
        """Make sure ``model_name`` is loaded; implementations may cache."""
        ...

    def run_inference(self, payload: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Run one inference call and return the output dict.

        ``payload`` carries the task input; ``config`` may carry keys such
        as ``subtype`` and ``model``.
        """
        ...

    def unload_model(self, model_name: str) -> None:
        """Release ``model_name`` so its memory can be reclaimed."""
        ...
@@ -0,0 +1,151 @@
1
+ """HuggingFace backend adapter: url_classification and emotion_classification."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import threading
7
+ from typing import Any, Dict, Optional
8
+
9
+ logger = logging.getLogger("topos.engine.huggingface")
10
+
11
+ # Default models (same as current website_classifier and emo_27_job)
12
+ DEFAULT_URL_CLASSIFICATION_MODEL = "KnutJaegersberg/website-classifier"
13
+ DEFAULT_EMOTION_MODEL = "SamLowe/roberta-base-go_emotions"
14
+
15
+
16
class HuggingFaceAdapter:
    """BackendAdapter for HuggingFace: URL classification and emotion classification.

    Models are loaded lazily on the first inference call for a subtype and
    cached per adapter instance. Each cache remembers the model name it was
    built for, so asking for a different model reloads instead of silently
    reusing the previous one.
    """

    def __init__(self) -> None:
        self._url_pipeline: Any = None
        # Name of the model behind ``_url_pipeline`` (None when nothing is cached).
        self._url_model_name: Optional[str] = None
        self._url_lock = threading.Lock()
        self._emotion_model: Any = None
        self._emotion_tokenizer: Any = None
        # Name of the model behind ``_emotion_model`` (None when nothing is cached).
        self._emotion_model_name: Optional[str] = None
        self._emotion_loaded = False
        self._emotion_lock = threading.Lock()

    def load_model(self, model_name: str, config: Optional[Dict[str, Any]] = None) -> None:
        """No-op: models are loaded on the first ``run_inference`` per subtype."""
        return None

    def ensure_model(self, model_name: str, subtype: Optional[str] = None) -> bool:
        """Ensure the model snapshot is present in the local HuggingFace cache.

        Returns:
            ``True`` if a download was performed (the caller may clean the
            cache up later); ``False`` if the model was already cached, if
            ``huggingface_hub`` is not installed, or if the download failed.
        """
        try:
            from huggingface_hub import snapshot_download
        except ImportError:
            return False

        # Probe the cache without touching the network. A miss raises, which
        # simply means a download is needed.
        # NOTE(review): a partially downloaded snapshot may also count as
        # cached here; the model loaders fetch missing files on first use.
        try:
            snapshot_download(repo_id=model_name, local_files_only=True)
        except Exception:
            cached = False
        else:
            cached = True
        if cached:
            return False

        logger.info("Downloading model %s (huggingface)...", model_name)
        try:
            # No extra keyword is needed for progress output; passing an
            # unsupported keyword here would make every download fail.
            snapshot_download(repo_id=model_name)
        except Exception:
            logger.exception("Failed to download model %s", model_name)
            return False
        logger.info("Model %s (huggingface) download complete.", model_name)
        return True

    def _get_url_pipeline(self, model_name: str):
        """Return the text-classification pipeline for *model_name*, loading it if needed."""
        with self._url_lock:
            if self._url_pipeline is not None and self._url_model_name == model_name:
                return self._url_pipeline
            from transformers import pipeline

            loaded = pipeline(
                task="text-classification",
                model=model_name,
            )
            self._url_pipeline = loaded
            self._url_model_name = model_name
            return loaded

    def _get_emotion_model(self, model_name: str):
        """Return ``(model, tokenizer)`` for *model_name*, loading them if needed."""
        with self._emotion_lock:
            if self._emotion_loaded and self._emotion_model_name == model_name:
                return self._emotion_model, self._emotion_tokenizer
            from transformers import AutoModelForSequenceClassification, AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSequenceClassification.from_pretrained(model_name)
            model.eval()
            self._emotion_tokenizer = tokenizer
            self._emotion_model = model
            self._emotion_model_name = model_name
            self._emotion_loaded = True
            return model, tokenizer

    def run_inference(self, payload: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Dispatch on ``config["subtype"]`` to URL or emotion classification.

        An unknown subtype returns an error-style dict (``status`` set to
        ``"unsupported"``) rather than raising.
        """
        config = config or {}
        subtype = config.get("subtype") or ""
        model = config.get("model") or ""

        if subtype == "url_classification":
            return self._run_url_classification(payload, model)
        if subtype in ("emotion_classification", "emo_27"):
            return self._run_emotion_classification(payload, model)
        # Unknown subtype: return error-like output so formatter can set status
        return {"error": f"Unknown subtype: {subtype}", "status": "unsupported"}

    def _run_url_classification(self, payload: Dict[str, Any], model_name: str) -> Dict[str, Any]:
        """Classify ``payload["url"]`` (optionally with ``payload["title"]``).

        Returns a dict with ``category``, ``confidence`` and ``model``; an
        ``error`` key is added when the URL is missing or blank.
        """
        url = payload.get("url") or ""
        title = payload.get("title") or ""
        model = model_name or DEFAULT_URL_CLASSIFICATION_MODEL
        if not isinstance(url, str) or not url.strip():
            return {"error": "url must be a non-empty string", "category": "unknown", "confidence": 0.0, "model": model}
        pipeline = self._get_url_pipeline(model)
        clean_url = url.strip()
        clean_title = (title or "").strip()
        text = f"{clean_url} [SEP] {clean_title}" if clean_title else clean_url
        result = pipeline(text, truncation=True, top_k=1)
        top_result = result[0] if isinstance(result, list) and result else {}
        # Tolerate a nested list-of-lists result shape as well.
        if isinstance(top_result, list):
            top_result = top_result[0] if top_result else {}
        return {
            "category": top_result.get("label", "unknown"),
            "confidence": float(top_result.get("score", 0.0) or 0.0),
            "model": model,
        }

    def _run_emotion_classification(self, payload: Dict[str, Any], model_name: str) -> Dict[str, Any]:
        """Classify the emotion of ``payload["text"]`` (or ``payload["content"]``).

        Softmax is applied to the logits of the first batch item; up to five
        top labels are considered and those with confidence above 0.1 are
        reported in ``all_emotions``. The best one is also returned as
        ``emotion_label`` / ``confidence`` (both None when none qualifies).
        """
        text = payload.get("text") or payload.get("content") or ""
        model = model_name or DEFAULT_EMOTION_MODEL
        if not text or not isinstance(text, str):
            return {"error": "text or content required", "emotion_label": None, "confidence": None, "all_emotions": [], "model": model}
        import torch

        emo_model, tokenizer = self._get_emotion_model(model)
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True,
        )
        with torch.no_grad():
            outputs = emo_model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits[0], dim=-1)
        labels = getattr(emo_model.config, "id2label", {}) or {}
        # Bound k by the real number of classes so topk can never be asked
        # for more entries than exist.
        top_k = min(5, int(probabilities.shape[-1]))
        top_probs, top_indices = torch.topk(probabilities, top_k)
        all_emotions = []
        for prob, idx in zip(top_probs, top_indices):
            label_id = idx.item()
            confidence = prob.item()
            if confidence > 0.1:
                label = labels.get(label_id, f"label_{label_id}")
                all_emotions.append({"label": label, "confidence": float(confidence)})
        top = all_emotions[0] if all_emotions else None
        return {
            "emotion_label": top["label"] if top else None,
            "confidence": top["confidence"] if top else None,
            "all_emotions": all_emotions,
            "model": model,
        }

    def unload_model(self, model_name: str) -> None:
        """Drop the cached pipeline/model that *model_name* refers to.

        A cache is cleared when the name equals the model currently cached,
        equals the corresponding default model, or contains the default
        model's short name.
        """
        name = model_name or ""
        with self._url_lock:
            if (
                name == self._url_model_name
                or name == DEFAULT_URL_CLASSIFICATION_MODEL
                or "website-classifier" in name
            ):
                self._url_pipeline = None
                self._url_model_name = None
        with self._emotion_lock:
            if (
                name == self._emotion_model_name
                or name == DEFAULT_EMOTION_MODEL
                or "go_emotions" in name
            ):
                self._emotion_model = None
                self._emotion_tokenizer = None
                self._emotion_model_name = None
                self._emotion_loaded = False
@@ -0,0 +1,181 @@
1
+ """Ollama backend adapter: HTTP API for local LLM inference."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import urllib.request
8
+ import urllib.error
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ logger = logging.getLogger("topos.engine.ollama")
12
+
13
+
14
class OllamaAdapter:
    """BackendAdapter that talks to an Ollama server over its HTTP API.

    The base URL defaults to the ``engine_ollama_base_url`` setting and
    falls back to ``http://localhost:11434``.
    """

    def __init__(self, base_url: Optional[str] = None) -> None:
        if base_url is None:
            try:
                from ...config.settings import settings
                base_url = getattr(settings, "engine_ollama_base_url", None) or "http://localhost:11434"
            except Exception:
                # Settings are optional here; any failure selects the default.
                base_url = "http://localhost:11434"
        self._base_url = str(base_url).rstrip("/")

    @staticmethod
    def _normalize_model_name(name: str) -> str:
        """Return *name* with an explicit tag (``:latest`` when none is given).

        Only the last path segment is inspected so that a registry host with
        a port (``host:5000/ns/model``) is not mistaken for a tag.
        """
        cleaned = (name or "").strip()
        if not cleaned:
            return cleaned
        tail = cleaned.rsplit("/", 1)[-1]
        return cleaned if ":" in tail else f"{cleaned}:latest"

    @staticmethod
    def _format_progress_bar(pct: int, width: int = 10) -> str:
        """Render a fixed-width ``===>   `` style bar for *pct* percent."""
        filled = width if pct >= 100 else int(width * pct / 100)
        head = ">" if filled < width and pct > 0 else ""
        return ("=" * filled + head).ljust(width)[:width]

    @staticmethod
    def _raise_on_pull_error(model_name: str, event: Any) -> None:
        """Raise ``RuntimeError`` if a pull response object reports an error."""
        if isinstance(event, dict) and event.get("error"):
            raise RuntimeError(f"Ollama pull failed for {model_name}: {event['error']}")

    def list_models(self) -> List[str]:
        """Return the model names reported by ``/api/tags``.

        Best effort: any failure (server down, bad JSON, ...) yields ``[]``.
        """
        req = urllib.request.Request(f"{self._base_url}/api/tags", method="GET")
        try:
            with urllib.request.urlopen(req, timeout=10) as resp:
                data = json.loads(resp.read().decode())
            return [m.get("name", "") for m in data.get("models", []) if m.get("name")]
        except Exception:
            logger.debug("Could not list Ollama models at %s", self._base_url, exc_info=True)
            return []

    def pull_model(self, model_name: str, *, stream: bool = True) -> None:
        """Download *model_name* through ``/api/pull``.

        With ``stream=True`` progress is logged at every 10% step.

        Raises:
            RuntimeError: If the server reports an error in its response,
                including an error event in the middle of the stream.
            urllib.error.URLError: If the HTTP request itself fails.
        """
        body = {"model": model_name, "stream": stream}
        req = urllib.request.Request(
            f"{self._base_url}/api/pull",
            data=json.dumps(body).encode("utf-8"),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        logger.info("Downloading model %s (ollama)...", model_name)
        with urllib.request.urlopen(req, timeout=3600) as resp:
            if not stream:
                self._raise_on_pull_error(model_name, json.loads(resp.read().decode()))
            else:
                last_pct = -1
                for line in resp:
                    if not line.strip():
                        continue
                    try:
                        event = json.loads(line.decode())
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(event, dict):
                        continue
                    # A streamed pull reports failure as an error event, not
                    # as an HTTP error; without this check a failed pull
                    # would be logged as complete.
                    self._raise_on_pull_error(model_name, event)
                    status = event.get("status", "")
                    total = event.get("total") or 0
                    completed = event.get("completed") or 0
                    if total and total > 0 and completed >= 0:
                        pct = min(100, int(100 * completed / total))
                        if pct != last_pct and (pct % 10 == 0 or pct == 100):
                            last_pct = pct
                            logger.info(
                                "Pulling model %s: [%s] %d%% (%.1f / %.1f MB)",
                                model_name,
                                self._format_progress_bar(pct),
                                pct,
                                completed / (1024 * 1024),
                                total / (1024 * 1024),
                            )
                    elif status:
                        logger.debug("Pulling model %s: %s", model_name, status)
        logger.info("Model %s (ollama) pull complete.", model_name)

    def delete_model(self, model_name: str) -> None:
        """Remove *model_name* from the server.

        An HTTP 404 is treated as success (nothing to remove); any other
        HTTP error propagates.
        """
        req = urllib.request.Request(
            f"{self._base_url}/api/delete",
            data=json.dumps({"model": model_name}).encode("utf-8"),
            headers={"Content-Type": "application/json"},
            method="DELETE",
        )
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                resp.read()
        except urllib.error.HTTPError as e:
            if e.code != 404:
                raise

    def ensure_model(self, model_name: str) -> bool:
        """Pull *model_name* unless the server already lists that exact model.

        Names are compared after tag normalization (a missing tag means
        ``:latest``). A different tag of the same base model, or a name that
        merely contains the requested one, does not count as present.

        Returns:
            ``True`` if the model was pulled (the caller may want to remove
            it later), ``False`` if it was already present.
        """
        wanted = self._normalize_model_name(model_name)
        available = {self._normalize_model_name(n) for n in self.list_models()}
        if wanted in available:
            return False
        self.pull_model(model_name, stream=True)
        return True

    def load_model(self, model_name: str, config: Optional[Dict[str, Any]] = None) -> None:
        """Load the model into memory by running a one-token generate request."""
        self._generate(model_name, " ", num_predict=1, keep_alive=None)

    def run_inference(self, payload: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Build a prompt from *payload*, call ``/api/generate`` and parse the reply.

        Never raises: any failure is returned as a dict with an ``error`` key.
        """
        config = config or {}
        subtype = config.get("subtype") or ""
        model = config.get("model") or "llama3.2:3b"
        text = payload.get("text") or payload.get("content") or payload.get("url") or ""
        try:
            if subtype in ("emotion_classification", "emo_27"):
                prompt = (
                    f'Classify the emotion of this text in one word or short phrase. '
                    f'Reply with JSON only: {{"emotion_label": "...", "confidence": 0.9}}\n\nText: {text}'
                )
            else:
                prompt = str(payload) if payload else ""
            response_text = self._generate(model, prompt, num_predict=None, keep_alive=None)
            out = self._parse_response(response_text, subtype, model)
            out["model"] = model
            return out
        except Exception as e:
            return {"error": str(e), "model": model, "emotion_label": None, "confidence": None, "all_emotions": []}

    def _generate(
        self,
        model: str,
        prompt: str,
        *,
        num_predict: Optional[int] = None,
        keep_alive: Optional[str] = None,
    ) -> str:
        """POST a non-streaming ``/api/generate`` request and return its ``response`` text.

        Raises:
            RuntimeError: If the HTTP request fails with a ``URLError``.
        """
        body: Dict[str, Any] = {"model": model, "prompt": prompt, "stream": False}
        if keep_alive is not None:
            body["keep_alive"] = keep_alive
        if num_predict is not None:
            body["options"] = {"num_predict": num_predict}
        req = urllib.request.Request(
            f"{self._base_url}/api/generate",
            data=json.dumps(body).encode("utf-8"),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=60) as resp:
                data = json.loads(resp.read().decode())
            return data.get("response", "")
        except urllib.error.URLError as e:
            raise RuntimeError(f"Ollama request failed: {e}") from e

    def _parse_response(self, response_text: str, subtype: str, model: str) -> Dict[str, Any]:
        """Turn the raw model reply into an output dict.

        For emotion subtypes the outermost ``{...}`` span is parsed as JSON;
        if that fails, the first 100 characters of the reply become the
        label. Other subtypes return the stripped text under ``output``.
        """
        response_text = (response_text or "").strip()
        if subtype not in ("emotion_classification", "emo_27"):
            return {"output": response_text}
        start = response_text.find("{")
        end = response_text.rfind("}") + 1
        if 0 <= start < end:
            try:
                obj = json.loads(response_text[start:end])
            except json.JSONDecodeError:
                obj = None
            if obj is not None:
                return {
                    "emotion_label": obj.get("emotion_label"),
                    "confidence": obj.get("confidence"),
                    "all_emotions": [{"label": obj.get("emotion_label"), "confidence": obj.get("confidence", 0)}],
                }
        return {"emotion_label": response_text[:100] if response_text else None, "confidence": None, "all_emotions": []}

    def unload_model(self, model_name: str) -> None:
        """Unload the model by sending a one-token generate with ``keep_alive="0"``."""
        self._generate(model_name, " ", num_predict=1, keep_alive="0")
@@ -0,0 +1,22 @@
1
+ """Stub backend adapter for when no real backend is configured."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, Optional
6
+
7
+
8
class StubBackendAdapter:
    """Placeholder backend: performs no inference and returns a fixed dict."""

    def load_model(self, model_name: str, config: Optional[Dict[str, Any]] = None) -> None:
        """Do nothing; the stub has no model to load."""
        return None

    def run_inference(self, payload: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Ignore the inputs and return a fresh fixed status/message dict."""
        response: Dict[str, Any] = {}
        response["status"] = "stub"
        response["message"] = "No backend configured; use Sprint 02+ for real inference"
        return response

    def unload_model(self, model_name: str) -> None:
        """Do nothing; the stub has no model to unload."""
        return None
19
+
20
+
21
def get_stub_adapter() -> StubBackendAdapter:
    """Return a new ``StubBackendAdapter`` (a fresh instance on every call)."""
    adapter = StubBackendAdapter()
    return adapter