topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
"""
|
|
2
|
+
v1 LLM-backed field transforms via Ollama (e.g. llama3.2).
|
|
3
|
+
|
|
4
|
+
Configuration (precedence: device DB overrides → Settings / env → safe defaults):
|
|
5
|
+
- File/env: see `topos.config.settings` (SANITIZATION_OLLAMA_*).
|
|
6
|
+
- Device: `engine_config` key `sanitization_ollama_device` (JSON), via PUT /v1/sanitization-ollama-config.
|
|
7
|
+
|
|
8
|
+
If /api/chat returns 404 with a model-not-found error and SANITIZATION_OLLAMA_AUTO_PULL is true (default),
|
|
9
|
+
the engine pulls the model via Ollama /api/pull (see OllamaAdapter.ensure_model) then retries chat once.
|
|
10
|
+
|
|
11
|
+
Fail-open: on error, callers keep the original text.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import logging
|
|
18
|
+
import re
|
|
19
|
+
import threading
|
|
20
|
+
from contextlib import contextmanager
|
|
21
|
+
from typing import Any, Dict, Final, Iterator, List, Optional, Tuple
|
|
22
|
+
|
|
23
|
+
from topos.config.sanitization_ollama import (
|
|
24
|
+
SANITIZATION_OLLAMA_TRANSFORM_IDS,
|
|
25
|
+
SanitizationOllamaEffective,
|
|
26
|
+
resolve_sanitization_ollama_effective,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger("topos.sanitization.ollama")
|
|
30
|
+
|
|
31
|
+
# Serialize Ollama pulls when many UMA rows hit a missing model at once.
|
|
32
|
+
_pull_locks: Dict[Tuple[str, str], threading.Lock] = {}
|
|
33
|
+
_pull_registry_lock = threading.Lock()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@contextmanager
def _serialized_ollama_pull(host: str, model: str) -> Iterator[None]:
    """Hold a per-(host, model) lock for the duration of the ``with`` body.

    The host is keyed with trailing slashes stripped, so ``http://h`` and
    ``http://h/`` share one lock. Locks are created on first use and kept in
    the module-level ``_pull_locks`` registry.
    """
    registry_key = (host.rstrip("/"), model)
    # Look up (or lazily create) the lock under the registry guard so that two
    # threads can never create different locks for the same key.
    with _pull_registry_lock:
        pull_lock = _pull_locks.get(registry_key)
        if pull_lock is None:
            pull_lock = threading.Lock()
            _pull_locks[registry_key] = pull_lock
    # The registry guard is released before blocking on the per-key lock.
    with pull_lock:
        yield
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _response_suggests_missing_ollama_model(response: Any) -> bool:
    """Return True when ``response`` looks like Ollama's "model not found" 404.

    Heuristic: the status must be 404 and the error text (the JSON ``error``
    field, or the raw body text when JSON decoding fails) must contain both
    "not found" and "model", case-insensitively.
    """
    status = getattr(response, "status_code", None)
    if status != 404:
        return False

    message = ""
    try:
        payload = response.json()
        if isinstance(payload, dict):
            message = str(payload.get("error") or "")
    except Exception:
        # Body was not JSON (or could not be read as such): fall back to text.
        message = str(getattr(response, "text", None) or "")

    lowered = message.lower()
    return all(marker in lowered for marker in ("not found", "model"))
|
|
60
|
+
|
|
61
|
+
# Backward-compatible alias
|
|
62
|
+
OLLAMA_TRANSFORM_IDS: Final[tuple[str, ...]] = SANITIZATION_OLLAMA_TRANSFORM_IDS
|
|
63
|
+
|
|
64
|
+
_SYSTEM_STRICT: Final[str] = (
|
|
65
|
+
"You are a precise text processor for a privacy-preserving data export pipeline. "
|
|
66
|
+
"Follow instructions exactly. Output ONLY the requested result with no markdown fences, "
|
|
67
|
+
"no preamble, and no explanation unless the format explicitly asks for structured fields."
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _user_pii_redaction(text: str) -> str:
    """Build the user prompt asking the model to replace PII spans with bracketed tokens."""
    prompt_lines = (
        "Redact personally identifiable information in the following text.",
        "Replace each span of PII with a bracketed token: [NAME], [EMAIL], [PHONE], [ADDRESS], [ID], [URL], [DATE_OF_BIRTH], [ACCOUNT], [OTHER_PII].",
        "Preserve structure (paragraphs, newlines) and non-PII wording. Do not invent content.",
        "",
        "TEXT:",
        "---",
        f"{text}",
        "---",
        "OUTPUT (redacted text only):",
    )
    return "\n".join(prompt_lines)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _user_nsfw_sanitization(text: str) -> str:
    """Build the user prompt asking the model to replace explicit content with [REMOVED]."""
    prompt_lines = (
        "Sanitize the following text for a general audience: remove or replace sexually explicit, graphically violent, or illegal-content descriptions with [REMOVED].",
        "Keep the rest of the message meaning where possible. Do not add commentary.",
        "",
        "TEXT:",
        "---",
        f"{text}",
        "---",
        "OUTPUT (sanitized text only):",
    )
    return "\n".join(prompt_lines)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _user_raw_to_summary(text: str, params: Dict[str, Any]) -> str:
    """Build the user prompt asking the model for a one-paragraph summary.

    Args:
        text: The text to summarize.
        params: Optional settings. ``style`` (default ``"neutral"``) sets the
            tone; ``max_length`` adds a word-count hint when it is a positive
            integer. ``None`` is treated as an empty mapping.

    Returns:
        The prompt string.
    """
    params = params or {}
    style = str(params.get("style") or "neutral").strip() or "neutral"
    max_len = params.get("max_length")
    cap = ""
    # bool is a subclass of int; without the explicit exclusion max_length=True
    # would render as "about True words".
    if isinstance(max_len, int) and not isinstance(max_len, bool) and max_len > 0:
        cap = f" Maximum length: about {max_len} words."
    return f"""Summarize the following text in a {style} tone.{cap}
Output a single concise paragraph.

TEXT:
---
{text}
---
OUTPUT (summary only):"""
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _user_raw_to_sentiment(text: str, params: Dict[str, Any]) -> str:
    """Build the user prompt asking for a one-line JSON sentiment verdict.

    ``params["scale"]`` names the scale quoted in the prompt; it falls back to
    ``"ternary"`` when missing, falsy, or blank.
    """
    requested_scale = str(params.get("scale") or "ternary").strip()
    scale = requested_scale if requested_scale else "ternary"
    prompt_lines = (
        f'Classify the overall sentiment of the text on scale "{scale}".',
        'Respond with EXACTLY one line of JSON (no markdown) with keys: "label" (string) and "confidence" (number 0-1).',
        'Example: {"label":"positive","confidence":0.71}',
        "",
        "TEXT:",
        "---",
        f"{text}",
        "---",
        "OUTPUT (one JSON line only):",
    )
    return "\n".join(prompt_lines)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _user_third_party_anonymization(text: str, params: Dict[str, Any]) -> str:
    """Build the user prompt asking the model to anonymize third-party names.

    ``params["mode"]`` is echoed into the prompt; it falls back to
    ``"replace"`` when missing, falsy, or blank.
    """
    requested_mode = str(params.get("mode") or "replace").strip()
    mode = requested_mode if requested_mode else "replace"
    prompt_lines = (
        "Anonymize third-party identities in the text (other people, companies, or clients mentioned by name).",
        f"Mode: {mode}. Replace identifiable third-party names with [PERSON_1], [ORG_1], etc. Keep the narrator's own voice and first-person references unchanged if clearly \"I/me/my\".",
        'Do not remove non-identifying roles ("my therapist", "a colleague") unless they include a name.',
        "",
        "TEXT:",
        "---",
        f"{text}",
        "---",
        "OUTPUT (anonymized text only):",
    )
    return "\n".join(prompt_lines)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _user_name_removal(text: str) -> str:
    """Build the user prompt asking the model to replace personal names with [NAME]."""
    prompt_lines = (
        "Remove personal names (people and well-known public figures) from the text. Replace each removed name with [NAME].",
        "Keep dates, numbers, and non-name entities. Preserve readability.",
        "",
        "TEXT:",
        "---",
        f"{text}",
        "---",
        "OUTPUT (text only):",
    )
    return "\n".join(prompt_lines)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _user_contact_removal(text: str) -> str:
    """Build the user prompt asking the model to replace contact details with [CONTACT]."""
    prompt_lines = (
        "Remove contact details: email addresses, phone numbers, street addresses, social handles (@user), and messaging IDs.",
        "Replace each removed span with [CONTACT]. Keep the rest of the message.",
        "",
        "TEXT:",
        "---",
        f"{text}",
        "---",
        "OUTPUT (text only):",
    )
    return "\n".join(prompt_lines)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _build_messages(transform_id: str, text: str, params: Dict[str, Any]) -> List[Dict[str, str]]:
    """Assemble the system + user chat messages for one transform.

    Raises:
        ValueError: if ``transform_id`` has no prompt builder.
    """
    options = params if params else {}
    # Builders that only need the text.
    text_only = {
        "pii_redaction": _user_pii_redaction,
        "nsfw_sanitization": _user_nsfw_sanitization,
        "name_removal": _user_name_removal,
        "contact_removal": _user_contact_removal,
    }
    # Builders that also read per-transform parameters.
    with_params = {
        "raw_to_summary": _user_raw_to_summary,
        "raw_to_sentiment": _user_raw_to_sentiment,
        "third_party_anonymization": _user_third_party_anonymization,
    }

    if transform_id in text_only:
        user_content = text_only[transform_id](text)
    elif transform_id in with_params:
        user_content = with_params[transform_id](text, options)
    else:
        raise ValueError(f"No Ollama prompt builder for transform_id={transform_id!r}")

    system_message = {"role": "system", "content": _SYSTEM_STRICT}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def ollama_sanitization_enabled() -> bool:
    """Report the ``enabled`` flag of the resolved sanitization config (Settings + device overrides)."""
    # Kept function-local as in the original (presumably to avoid an import
    # cycle with the settings/state modules — confirm before hoisting).
    from topos.config.settings import settings
    from topos.core.state import get_db_connection

    connection = get_db_connection()
    effective = resolve_sanitization_ollama_effective(settings, connection)
    return effective.enabled
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _ollama_chat(
    messages: List[Dict[str, str]],
    *,
    host: str,
    model: str,
    timeout_sec: float,
    auto_pull: bool = True,
) -> str:
    """POST ``messages`` to Ollama ``/api/chat`` and return the reply text.

    When ``auto_pull`` is true and the first response looks like a
    "model not found" 404, the model is pulled (serialized per host/model via
    ``_serialized_ollama_pull``) and the chat request is retried once.

    Args:
        messages: Chat messages in Ollama's ``role``/``content`` form.
        host: Base URL of the Ollama server; a trailing slash is tolerated.
        model: Ollama model tag to use.
        timeout_sec: Timeout handed to ``httpx.Client``.
        auto_pull: Whether to pull a missing model and retry.

    Returns:
        ``message.content`` from the response, stripped of surrounding whitespace.

    Raises:
        httpx.HTTPStatusError: if the final response has an error status.
        ValueError: if the response body is not a JSON object carrying a
            string ``message.content``.
    """
    import httpx

    url = f"{host.rstrip('/')}/api/chat"
    body = {"model": model, "messages": messages, "stream": False}
    with httpx.Client(timeout=timeout_sec) as client:
        resp = client.post(url, json=body)
        if (
            auto_pull
            and resp.status_code == 404
            and _response_suggests_missing_ollama_model(resp)
        ):
            logger.info(
                "Ollama model %r not available at %s; pulling via /api/pull then retrying /api/chat",
                model,
                host.rstrip("/"),
            )
            with _serialized_ollama_pull(host, model):
                from topos.engine.backends.ollama import OllamaAdapter

                pulled = OllamaAdapter(base_url=host).ensure_model(model)
                if pulled:
                    logger.info("Sanitization: finished pulling Ollama model %r", model)
            # Single retry after the pull attempt; any remaining error status
            # is surfaced by raise_for_status() below.
            resp = client.post(url, json=body)
        resp.raise_for_status()
        data = resp.json()

    # Validate the shape explicitly so a list/str/null payload (or a non-dict
    # "message") raises the same ValueError as a missing content field rather
    # than an AttributeError from .get().
    msg = data.get("message") if isinstance(data, dict) else None
    content = msg.get("content") if isinstance(msg, dict) else None
    if not isinstance(content, str):
        raise ValueError("Ollama response missing message.content")
    return content.strip()
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _strip_fences(s: str) -> str:
    """Remove one surrounding Markdown code fence from ``s``, if present.

    The input is stripped first. When the whole string is wrapped in a triple
    backtick fence, the inner text (stripped) is returned; otherwise the
    stripped input is returned unchanged.

    A language tag is only recognised when it ends the opening fence line
    (``json`` followed by a newline). Previously the first word of single-line
    fenced content was consumed as a tag, so "```Hello world```" lost "Hello".
    """
    s = s.strip()
    m = re.match(r"^```(?:[^\s`]*[ \t]*\r?\n)?(.*?)\n?```\s*$", s, re.DOTALL)
    return m.group(1).strip() if m else s
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def apply_text_transform_with_ollama(
    text: str,
    transform_id: str,
    params: Optional[Dict[str, Any]] = None,
    *,
    effective: SanitizationOllamaEffective,
    model_override: Optional[str] = None,
) -> str:
    """Apply one catalog transform to ``text`` through Ollama and return the result.

    Args:
        text: Input text; cut to ``effective.max_input_chars`` (when positive)
            with a ``[TRUNCATED_FOR_LLM]`` marker appended.
        transform_id: Must be one of ``SANITIZATION_OLLAMA_TRANSFORM_IDS``.
        params: Optional per-transform parameters passed to the prompt builder.
        effective: Resolved configuration (host, models, timeout, auto-pull).
        model_override: When non-blank, used instead of the per-transform or
            default model.

    Returns:
        The model output with any surrounding code fence removed. For
        ``raw_to_sentiment`` the first output line is parsed as JSON and
        rendered as ``"label (0.71)"`` or ``"label"``; if parsing fails the
        first 500 characters of the output are returned instead.

    Raises:
        ValueError: if ``transform_id`` is not handled by Ollama sanitization.
    """
    if transform_id not in SANITIZATION_OLLAMA_TRANSFORM_IDS:
        raise ValueError(f"transform_id {transform_id!r} is not handled by Ollama sanitization")

    # Precedence: explicit override, then per-transform model, then default.
    chosen_model = (model_override or "").strip()
    if not chosen_model:
        chosen_model = effective.models.get(transform_id) or effective.default_model

    limit = effective.max_input_chars
    if limit > 0 and len(text) > limit:
        text = text[:limit] + "\n[TRUNCATED_FOR_LLM]"

    chat_messages = _build_messages(transform_id, text, dict(params or {}))
    reply = _ollama_chat(
        chat_messages,
        host=effective.host,
        model=chosen_model,
        timeout_sec=effective.timeout_sec,
        auto_pull=effective.auto_pull,
    )
    out = _strip_fences(reply)

    if transform_id != "raw_to_sentiment":
        return out

    # Sentiment replies are expected to be a single JSON line; anything else
    # falls back to the (capped) raw text.
    try:
        first_line = out.splitlines()[0].strip()
        verdict = json.loads(first_line)
        label = str(verdict.get("label", "unknown"))
        confidence = verdict.get("confidence")
        if isinstance(confidence, (int, float)):
            return f"{label} ({float(confidence):.2f})"
        return label
    except Exception:
        return out[:500]
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# Scope → canonical table resolution (Sprint 05 Stage 1)
|
|
2
|
+
# Maps MVP scope IDs from RPT to canonical table names. Reference: roles_scopes/MVP_TAXONOMY.md §4
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from typing import List, Optional, Set
|
|
7
|
+
|
|
8
|
+
# Scope ID -> canonical table(s). One scope can map to one or more tables.
# Table names match MVP_TAXONOMY §4 (messages, ai_messages, events, ai_chat, activity, journal, Profile).
# The "*" entry is a wildcard marker: resolve_scopes_to_tables expands it to ALL_CANONICAL_TABLES.
SCOPE_TO_TABLES: dict[str, list[str]] = {
    "messages:read": ["messages"],
    "messages:write": ["messages"],
    "aiMessages:read": ["ai_messages"],
    "events:read": ["events"],
    "events:write": ["events"],
    "aiChat:read": ["ai_chat"],
    "activity:read": ["activity"],
    "activity:write": ["activity"],
    "journal:read": ["journal"],
    "journal:write": ["journal"],
    "activitySummary:read": ["activity_summary"],
    "wellnessSummary:read": ["wellness_summary"],
    "publicBio:read": ["public_bio"],
    "contacts:resolve": ["contacts", "contact_identifiers"],
    "all:read": ["*"],
    "all:write": ["*"],
}

# All canonical table names (for "all" resolution). Order doesn't matter.
# Every non-wildcard table named in SCOPE_TO_TABLES also appears here.
ALL_CANONICAL_TABLES: set[str] = {
    "messages",
    "ai_messages",
    "events",
    "ai_chat",
    "activity",
    "journal",
    "activity_summary",
    "wellness_summary",
    "public_bio",
    "contacts",
    "contact_identifiers",
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def resolve_scopes_to_tables(scope_ids: Optional[List[str]]) -> Set[str]:
    """
    Translate MVP scope IDs into the set of canonical table names they grant.

    An ``all:read`` / ``all:write`` scope (or any scope mapped to ``"*"``)
    short-circuits to a copy of ALL_CANONICAL_TABLES. Blank and unknown scopes
    are ignored; a missing or empty ``scope_ids`` yields an empty set.
    """
    resolved: Set[str] = set()
    for raw_scope in scope_ids or ():
        scope = (raw_scope or "").strip()
        if not scope:
            continue
        if scope == "all:read" or scope == "all:write":
            return set(ALL_CANONICAL_TABLES)
        table_names = SCOPE_TO_TABLES.get(scope)
        if table_names is None:
            # Unknown scope: grants nothing.
            continue
        if "*" in table_names:
            return set(ALL_CANONICAL_TABLES)
        resolved.update(table_names)
    return resolved
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def may_access_table(allowed_tables: Set[str], table_or_alias: str) -> bool:
    """
    True if the allowed set (from resolve_scopes_to_tables) permits access to the given table.

    table_or_alias can be a canonical name (e.g. messages) or an implementation
    table (e.g. ai_chat_messages, conversation_messages).

    Rules, in order:
      1. An empty allowed set permits nothing.
      2. "*" or "all" in the allowed set permits everything.
      3. An exact name match is permitted.
      4. An implementation table is permitted when any canonical table it
         backs is allowed.
    """
    if not allowed_tables:
        return False
    if "*" in allowed_tables or "all" in allowed_tables:
        return True
    if table_or_alias in allowed_tables:
        return True
    # Implementation table -> canonical tables whose data it holds.
    # The former explicit "messages" and "contacts"/"contact_identifiers"
    # branches were unreachable after the exact-match check above (the
    # contacts branch could only ever return False), so results are unchanged.
    backing_tables = {
        "conversation_messages": ("messages",),
        "ai_chat_messages": ("ai_chat", "ai_messages"),
    }
    return any(name in allowed_tables for name in backing_tables.get(table_or_alias, ()))
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Service interfaces and adapters for Topos."""
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from ..config.settings import settings
|
|
6
|
+
from .embeddings.base import EmbeddingsService
|
|
7
|
+
from .embeddings.local import LocalEmbeddingsService
|
|
8
|
+
from .embeddings.remote import RemoteEmbeddingsService
|
|
9
|
+
from .interfaces import DbService, DeviceService, LLMService, SyncService
|
|
10
|
+
from .local import LocalDbService, LocalDeviceService, LocalSyncService
|
|
11
|
+
from .llm.openai import OpenAILLMService
|
|
12
|
+
from .postgres import HostedDeviceService, HostedSyncService, PostgresDbService
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class Services:
    """Immutable bundle of the service adapters returned by ``get_services``."""

    # Database adapter (LocalDbService or PostgresDbService in get_services).
    db: DbService
    # Sync adapter (LocalSyncService or HostedSyncService).
    sync: SyncService
    # Device adapter (LocalDeviceService or HostedDeviceService).
    device: DeviceService
    # LLM adapter (OpenAILLMService in both modes).
    llm: LLMService
    # Embeddings adapter (LocalEmbeddingsService or RemoteEmbeddingsService).
    embeddings: EmbeddingsService
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
_services: Services | None = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_services() -> Services:
    """Return the module-wide ``Services`` bundle, building it on the first call.

    The hosted adapters are used when ``settings.topos_database_mode`` equals
    ``"postgres"``; every other value selects the local adapters. The result
    is cached in the module-level ``_services``.
    """
    global _services
    if _services is not None:
        return _services

    if settings.topos_database_mode == "postgres":
        bundle = Services(
            db=PostgresDbService(),
            sync=HostedSyncService(),
            device=HostedDeviceService(),
            llm=OpenAILLMService(),
            embeddings=RemoteEmbeddingsService(),
        )
    else:
        bundle = Services(
            db=LocalDbService(),
            sync=LocalSyncService(),
            device=LocalDeviceService(),
            llm=OpenAILLMService(),
            embeddings=LocalEmbeddingsService(),
        )
    _services = bundle
    return bundle
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Embeddings service adapters."""
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Optional, Protocol
|
|
4
|
+
|
|
5
|
+
from ..core.api_models import (
|
|
6
|
+
DeviceInfoResponse,
|
|
7
|
+
DeviceNameResponse,
|
|
8
|
+
PairDeviceResponse,
|
|
9
|
+
PairingCodeResponse,
|
|
10
|
+
StoreMessageResponse,
|
|
11
|
+
SyncDatabaseResponse,
|
|
12
|
+
SyncResponse,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DbService(Protocol):
    """Structural interface for database operations (message store, oplog, backup/restore).

    Only signatures are declared here; behaviour is defined by the
    implementing adapters.
    """

    async def store_message(self, payload: Dict[str, Any]) -> StoreMessageResponse: ...
    # The three readers below take an optional dataset filter; limit/offset
    # look like pagination controls — NOTE(review): confirm against the adapters.
    async def get_oplog(self, dataset_id: Optional[str], limit: int, offset: int) -> Dict[str, Any]: ...
    async def get_messages(self, dataset_id: Optional[str], limit: int, offset: int) -> Dict[str, Any]: ...
    async def replay_projection(self, dataset_id: Optional[str]) -> Dict[str, Any]: ...
    async def reset_database(self) -> Dict[str, Any]: ...
    async def sync_database(self) -> SyncDatabaseResponse: ...
    async def backup_database(self, encrypted: bool) -> Any: ...
    # `file` is untyped in the original; presumably an uploaded file object — TODO confirm.
    async def restore_database(self, file: Any, authenticated_user_id: str, encrypted: bool) -> Dict[str, Any]: ...
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class SyncService(Protocol):
    """Structural interface for triggering a sync; implementations return a ``SyncResponse``."""

    async def trigger_sync(self) -> SyncResponse: ...
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DeviceService(Protocol):
    """Structural interface for device pairing and device metadata operations."""

    async def get_pairing_code(self) -> PairingCodeResponse: ...
    # keep_existing_data presumably controls whether local data survives pairing — TODO confirm.
    async def pair_device(self, pairing_code: str, keep_existing_data: bool) -> PairDeviceResponse: ...
    async def get_device_info(self) -> DeviceInfoResponse: ...
    async def set_device_name(self, device_name: str) -> DeviceNameResponse: ...
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class LLMService(Protocol):
    """Structural interface for text generation and Ollama model listing."""

    # Both methods exchange plain dicts; the key schema is defined by the implementations.
    async def generate(self, payload: Dict[str, Any]) -> Dict[str, Any]: ...
    async def list_ollama_models(self) -> Dict[str, Any]: ...
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""LLM service adapters."""
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Dict
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
from fastapi import HTTPException, status
|
|
8
|
+
|
|
9
|
+
from ...core import state
|
|
10
|
+
from ...openai_client import OpenAIError
|
|
11
|
+
from ...config.settings import settings
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger("topos.services.llm")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _normalize_provider(raw: Any) -> str:
    """Map a requested provider to ``"openai"`` or ``"ollama"``.

    Matching is case-insensitive and ignores surrounding whitespace. ``None``,
    non-strings, and unrecognised names all fall back to ``"openai"``.
    """
    if isinstance(raw, str):
        candidate = raw.lower().strip()
        if candidate in {"openai", "ollama"}:
            return candidate
    return "openai"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
async def _ollama_generate(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Run a non-streaming completion against Ollama ``/api/generate``.

    Args:
        payload: Request dict. Reads ``prompt``, ``model`` (falls back to
            ``settings.sanitization_ollama_default_model`` when missing or
            blank), ``max_tokens`` (sent as ``options.num_predict``) and
            ``temperature``.

    Returns:
        ``{"output": <text>, "model": <model name>, "usage": {}}``.

    Raises:
        HTTPException: 502 when Ollama is unreachable, answers with an error
            status, or returns a body that is not a JSON object.
    """
    prompt = payload.get("prompt") or ""
    model_raw = payload.get("model")
    model = (
        (model_raw.strip() if isinstance(model_raw, str) else "")
        or settings.sanitization_ollama_default_model
    )
    max_tokens = payload.get("max_tokens")
    temperature = payload.get("temperature")
    base = settings.engine_ollama_base_url.rstrip("/")
    body: Dict[str, Any] = {"model": model, "prompt": prompt, "stream": False}
    opts: Dict[str, Any] = {}
    if max_tokens is not None:
        opts["num_predict"] = max_tokens
    if temperature is not None:
        opts["temperature"] = temperature
    if opts:
        body["options"] = opts
    timeout = httpx.Timeout(settings.sanitization_ollama_timeout_sec, connect=10.0)
    logger.info(
        "Ollama generate: model=%r base=%s prompt_chars=%d max_tokens=%s temperature=%s",
        model,
        base,
        # str() keeps the log call from raising on a truthy non-string prompt.
        len(str(prompt)),
        max_tokens,
        temperature,
    )
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            r = await client.post(f"{base}/api/generate", json=body)
    except httpx.RequestError as exc:
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail=f"Ollama unreachable at {base}: {exc}",
        ) from exc
    if r.status_code >= 400:
        detail = (r.text or str(r.status_code))[:800]
        raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail=f"Ollama error: {detail}")
    # A success status with a malformed body is still an upstream failure:
    # report it as 502 like the other Ollama errors instead of letting a
    # ValueError/AttributeError escape.
    try:
        data = r.json()
    except ValueError as exc:
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail="Ollama error: response was not valid JSON",
        ) from exc
    if not isinstance(data, dict):
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail="Ollama error: unexpected response shape",
        )
    text = str(data.get("response") or "").strip()
    resp_model = str(data.get("model") or model)
    logger.info(
        "Ollama generate complete: model=%r response_chars=%d",
        resp_model,
        len(text),
    )
    return {"output": text, "model": resp_model, "usage": {}}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
async def _ollama_list_model_names() -> list[str]:
    """Return the sorted, de-duplicated model names reported by Ollama ``/api/tags``.

    Entries that are not objects, or whose ``name`` is missing or blank, are
    skipped. A missing or null ``models`` field yields an empty list.

    Raises:
        HTTPException: 502 when Ollama is unreachable, answers with an error
            status, or returns a body that is not valid JSON.
    """
    base = settings.engine_ollama_base_url.rstrip("/")
    timeout = httpx.Timeout(settings.sanitization_ollama_timeout_sec, connect=10.0)
    logger.info("Ollama list models: base=%s", base)
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            r = await client.get(f"{base}/api/tags")
    except httpx.RequestError as exc:
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail=f"Ollama unreachable at {base}: {exc}",
        ) from exc
    if r.status_code >= 400:
        detail = (r.text or str(r.status_code))[:800]
        raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail=f"Ollama error: {detail}")
    try:
        data = r.json()
    except ValueError as exc:
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail="Ollama error: response was not valid JSON",
        ) from exc
    # "models" may be absent or null; treat both as "no models".
    entries = data.get("models") if isinstance(data, dict) else None
    names = set()
    for entry in entries or []:
        if not isinstance(entry, dict):
            continue
        name = str(entry.get("name") or "").strip()
        if name:  # drop whitespace-only names instead of returning ""
            names.add(name)
    unique = sorted(names)
    logger.info("Ollama list models complete: count=%d", len(unique))
    return unique
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class OpenAILLMService:
    """LLM adapter: serves OpenAI requests and routes ``provider == "ollama"`` to Ollama.

    Both methods answer 403 unless ``settings.enable_llm`` is truthy and the
    engine mode is ``"full"``.
    """

    async def list_ollama_models(self) -> Dict[str, Any]:
        """Return ``{"models": [...]}`` with the model names known to Ollama."""
        llm_available = settings.enable_llm and state.get_engine_mode() == "full"
        if not llm_available:
            raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="LLM is disabled")
        return {"models": await _ollama_list_model_names()}

    async def generate(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Generate text for ``payload`` and return ``output``, ``model`` and ``usage``.

        Raises:
            HTTPException: 403 when the LLM is disabled, 400 when no OpenAI
                API key is configured, 429 when the OpenAI error mentions
                ``rate_limited``, 502 for any other OpenAI error.
        """
        llm_available = settings.enable_llm and state.get_engine_mode() == "full"
        if not llm_available:
            raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="LLM is disabled")

        if _normalize_provider(payload.get("provider")) == "ollama":
            logger.info("LLM generate routed to Ollama (model=%r)", payload.get("model"))
            return await _ollama_generate(payload)

        if not settings.openai_api_key:
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="OPENAI_API_KEY is not set")

        try:
            result = await state.openai_client.generate(
                prompt=payload.get("prompt", ""),
                max_tokens=payload.get("max_tokens"),
                temperature=payload.get("temperature"),
            )
        except OpenAIError as exc:
            if "rate_limited" in str(exc):
                raise HTTPException(status_code=status.HTTP_429_TOO_MANY_REQUESTS, detail="LLM rate limited") from exc
            raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail="LLM upstream error") from exc
        return {"output": result["output"], "model": settings.openai_model, "usage": result["usage"]}
|