topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"""Background processing for Filter Lab job groups (serial runs, Ollama pull/cleanup)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import time
|
|
9
|
+
from typing import Any, Dict, List, Set
|
|
10
|
+
|
|
11
|
+
from topos.config.sanitization_ollama import resolve_sanitization_ollama_effective
|
|
12
|
+
from topos.config.settings import settings
|
|
13
|
+
from topos.core.state import get_db_connection
|
|
14
|
+
from topos.engine.backends.ollama import OllamaAdapter
|
|
15
|
+
from topos.sanitization.ollama_transforms import apply_text_transform_with_ollama
|
|
16
|
+
|
|
17
|
+
from . import bundles as bundles_mod
|
|
18
|
+
from . import store
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger("topos.filter_lab.worker")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _protection_tags(eff: Any) -> Set[str]:
|
|
24
|
+
s = {eff.default_model}
|
|
25
|
+
s.update(v for v in eff.models.values() if v)
|
|
26
|
+
return {str(x).strip() for x in s if x and str(x).strip()}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _input_hash(text: str) -> str:
|
|
30
|
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()[:24]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _ensure_model_pulled(
|
|
34
|
+
adapter: OllamaAdapter,
|
|
35
|
+
model_tag: str,
|
|
36
|
+
baseline: Set[str],
|
|
37
|
+
pulled: List[str],
|
|
38
|
+
conn: Any,
|
|
39
|
+
group_id: str,
|
|
40
|
+
) -> None:
|
|
41
|
+
if model_tag in baseline:
|
|
42
|
+
return
|
|
43
|
+
if model_tag in pulled:
|
|
44
|
+
return
|
|
45
|
+
adapter.pull_model(model_tag)
|
|
46
|
+
pulled.append(model_tag)
|
|
47
|
+
store.set_group_pulled_models(conn, group_id, list(pulled))
|
|
48
|
+
store.insert_model_event(conn, group_id, "pull", model_tag)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _cleanup_ephemeral(
|
|
52
|
+
adapter: OllamaAdapter,
|
|
53
|
+
pulled: List[str],
|
|
54
|
+
baseline: Set[str],
|
|
55
|
+
protected: Set[str],
|
|
56
|
+
conn: Any,
|
|
57
|
+
group_id: str,
|
|
58
|
+
) -> None:
|
|
59
|
+
for tag in pulled:
|
|
60
|
+
if tag in baseline:
|
|
61
|
+
continue
|
|
62
|
+
if tag in protected:
|
|
63
|
+
continue
|
|
64
|
+
try:
|
|
65
|
+
adapter.delete_model(tag)
|
|
66
|
+
store.insert_model_event(conn, group_id, "delete", tag)
|
|
67
|
+
except Exception as exc: # noqa: BLE001
|
|
68
|
+
logger.warning("filter_lab cleanup delete failed for %s: %s", tag, exc)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _lab_overrides(raw_options: Any, eff: Any) -> Dict[str, Any]:
    """Resolve the per-group ``timeout_sec`` / ``max_input_chars`` overrides.

    ``raw_options`` is the group's stored ``options_json`` value. Anything that
    cannot be decoded, or that decodes to something other than a JSON object,
    is treated as "no overrides". An override is honoured only when it is a
    real number (``timeout_sec``) or a real integer (``max_input_chars``);
    booleans are rejected explicitly because ``bool`` is a subclass of ``int``.
    In every other case the value already present on ``eff`` is kept.
    """
    if isinstance(raw_options, dict):
        # NOTE(review): some storage drivers may hand back decoded JSON -- confirm.
        opts: Any = raw_options
    else:
        try:
            opts = json.loads(raw_options or "{}")
        except (TypeError, ValueError):  # JSONDecodeError is a ValueError
            opts = {}
    if not isinstance(opts, dict):
        opts = {}

    timeout = opts.get("timeout_sec")
    max_chars = opts.get("max_input_chars")
    timeout_ok = isinstance(timeout, (int, float)) and not isinstance(timeout, bool)
    max_chars_ok = isinstance(max_chars, int) and not isinstance(max_chars, bool)
    return {
        "timeout_sec": float(timeout) if timeout_ok else eff.timeout_sec,
        "max_input_chars": int(max_chars) if max_chars_ok else eff.max_input_chars,
    }


def process_job_group_sync(group_id: str) -> None:
    """Execute all queued runs for a group serially; policy B cleanup on terminal state.

    Steps, in order:

    1. Load the group; return immediately when there is no DB connection, no
       such group, or the group is already completed/failed/cancelled.
    2. Resolve the effective sanitization Ollama config (forced to enabled for
       Lab evaluation) and merge the group's option overrides into it.
    3. Mark the group ``running`` and process every run whose status is
       ``queued``: a cancelled group cancels the run, an unknown record or an
       oversized input fails it, otherwise the model is pulled if needed and
       the transform is executed and recorded.
    4. Mark the group ``completed`` unless it was cancelled meanwhile, in which
       case the still-queued runs are cancelled instead.
    5. In all cases where an adapter was created, delete the models this group
       pulled that are neither baseline nor protected.

    Args:
        group_id: Identifier of the Filter Lab job group to process.
    """
    conn = get_db_connection()
    if not conn:
        logger.error("filter_lab: no DB for group %s", group_id)
        return

    row = store.get_group(conn, group_id)
    if not row:
        return
    group = dict(row)
    status = group["status"]
    if status in ("completed", "failed", "cancelled"):
        return

    adapter: OllamaAdapter | None = None
    pulled: List[str] = []
    baseline_set: Set[str] = set()
    protected: Set[str] = set()

    try:
        try:
            eff = resolve_sanitization_ollama_effective(settings, conn)
        except Exception as exc:  # noqa: BLE001
            logger.error("filter_lab: effective config failed: %s", exc)
            store.update_group_status(conn, group_id, "failed")
            return

        # Pipeline "sanitization Ollama enabled" can be off while the user still wants Lab eval.
        # Use the same host / models / limits; only skip blocking on eff.enabled.
        if not eff.enabled:
            logger.info(
                "filter_lab: sanitization pipeline is disabled in config; running Lab eval to Ollama anyway (group=%s)",
                group_id,
            )
        eff_for_lab = eff.model_copy(update={"enabled": True})

        # Malformed or non-object options fall back to the configured limits
        # instead of aborting the whole group.
        eff_merged = eff_for_lab.model_copy(
            update=_lab_overrides(group.get("options_json"), eff)
        )

        bundle = bundles_mod.get_bundle(group["bundle_id"])
        if not bundle:
            store.update_group_status(conn, group_id, "failed")
            return

        baseline_list = list(json.loads(group.get("baseline_models_json") or "[]"))
        baseline_set = set(baseline_list)
        pulled = list(json.loads(group.get("pulled_models_json") or "[]"))
        protected = _protection_tags(eff_merged)
        adapter = OllamaAdapter(base_url=eff_merged.host)

        store.update_group_status(conn, group_id, "running")

        for run_row in store.list_runs(conn, group_id):
            run = dict(run_row)
            if run["status"] != "queued":
                continue

            # Re-read the group before every run so a cancellation issued while
            # earlier runs were executing takes effect here.
            g2 = dict(store.get_group(conn, group_id) or {})
            if g2.get("status") == "cancelled":
                store.update_run(
                    conn,
                    run["id"],
                    status="cancelled",
                    finished_at=store.utc_now_iso(),
                )
                continue

            rec_id = run["record_id"]
            model_tag = run["model_tag"]
            record = next(
                (r for r in (bundle.get("records") or []) if str(r.get("id")) == rec_id),
                None,
            )
            if not record:
                store.update_run(
                    conn,
                    run["id"],
                    status="failed",
                    finished_at=store.utc_now_iso(),
                    error_code="UNKNOWN_RECORD",
                )
                continue

            text = bundles_mod.record_text(record)
            max_c = eff_merged.max_input_chars
            if max_c > 0 and len(text) > max_c:
                store.update_run(
                    conn,
                    run["id"],
                    status="failed",
                    finished_at=store.utc_now_iso(),
                    error_code="INPUT_TOO_LARGE",
                    input_hash=_input_hash(text),
                    # Keep slightly more than the limit as a preview of the input.
                    input_text=text[: max_c + 50],
                )
                continue

            store.update_run(
                conn,
                run["id"],
                status="running",
                started_at=store.utc_now_iso(),
                input_hash=_input_hash(text),
                input_text=text[:8000],
            )

            try:
                assert adapter is not None
                _ensure_model_pulled(adapter, model_tag, baseline_set, pulled, conn, group_id)
                t0 = time.perf_counter()
                out = apply_text_transform_with_ollama(
                    text,
                    group["filter_id"],
                    None,
                    effective=eff_merged,
                    model_override=model_tag,
                )
                ms = int((time.perf_counter() - t0) * 1000)
                store.update_run(
                    conn,
                    run["id"],
                    status="succeeded",
                    finished_at=store.utc_now_iso(),
                    latency_ms=ms,
                    output_text=out,
                    metrics_json=json.dumps({"input_chars": len(text), "output_chars": len(out)}),
                )
            except Exception as exc:  # noqa: BLE001
                # A failing run is recorded and the loop moves on to the next one.
                logger.warning("filter_lab run failed: %s", exc)
                store.update_run(
                    conn,
                    run["id"],
                    status="failed",
                    finished_at=store.utc_now_iso(),
                    error_code="RUN_ERROR",
                    output_text=str(exc)[:2000],
                )

        g3 = dict(store.get_group(conn, group_id) or {})
        if g3.get("status") == "cancelled":
            for r in store.list_runs(conn, group_id):
                rd = dict(r)
                if rd["status"] == "queued":
                    store.update_run(
                        conn,
                        rd["id"],
                        status="cancelled",
                        finished_at=store.utc_now_iso(),
                    )
        else:
            store.update_group_status(conn, group_id, "completed")

    finally:
        if adapter and conn:
            try:
                # Cleanup works from what is stored for the group and from a
                # freshly resolved config, not from the in-memory state above.
                row_f = store.get_group(conn, group_id)
                if row_f:
                    group_f = dict(row_f)
                    pulled_f = list(json.loads(group_f.get("pulled_models_json") or "[]"))
                    baseline_f = set(json.loads(group_f.get("baseline_models_json") or "[]"))
                    eff2 = resolve_sanitization_ollama_effective(settings, conn)
                    prot2 = _protection_tags(eff2)
                    _cleanup_ephemeral(adapter, pulled_f, baseline_f, prot2, conn, group_id)
            except Exception as exc:  # noqa: BLE001
                logger.warning("filter_lab cleanup skipped: %s", exc)
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from typing import Any, Dict, Optional
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
|
|
13
|
+
from .config.settings import settings
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger("topos.hosted_pool_lease")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _parse_iso_datetime(value: Any) -> Optional[datetime]:
|
|
19
|
+
raw = str(value or "").strip()
|
|
20
|
+
if not raw:
|
|
21
|
+
return None
|
|
22
|
+
try:
|
|
23
|
+
parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
|
|
24
|
+
if parsed.tzinfo is None:
|
|
25
|
+
parsed = parsed.replace(tzinfo=timezone.utc)
|
|
26
|
+
return parsed.astimezone(timezone.utc)
|
|
27
|
+
except Exception: # noqa: BLE001
|
|
28
|
+
return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _control_plane_http_base(ws_url: str) -> str:
|
|
32
|
+
parsed = urlparse(ws_url)
|
|
33
|
+
if parsed.scheme not in {"ws", "wss"}:
|
|
34
|
+
raise ValueError(f"Unsupported control plane websocket URL scheme: {parsed.scheme}")
|
|
35
|
+
http_scheme = "https" if parsed.scheme == "wss" else "http"
|
|
36
|
+
netloc = parsed.netloc
|
|
37
|
+
if not netloc:
|
|
38
|
+
raise ValueError("Control plane websocket URL missing host")
|
|
39
|
+
return f"{http_scheme}://{netloc}"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _metadata_identity_token(audience: str) -> str:
    """Fetch an identity token for ``audience`` from the Google metadata server.

    Performs a blocking GET (5 second timeout) against the default service
    account's ``identity`` endpoint, requesting the ``full`` token format.

    Raises:
        RuntimeError: If the response body is empty after stripping.
        Exception: Whatever ``httpx`` raises for transport errors or, via
            ``raise_for_status``, for error status codes.
    """
    endpoint = (
        "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/identity"
    )
    with httpx.Client(timeout=5.0) as http:
        response = http.get(
            endpoint,
            params={"audience": audience, "format": "full"},
            headers={"Metadata-Flavor": "Google"},
        )
        response.raise_for_status()
        identity_token = str(response.text or "").strip()
    if not identity_token:
        raise RuntimeError("Metadata server returned an empty identity token")
    return identity_token
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class HostedPoolLease:
    """Lease data kept by ``HostedPoolLeaseClient`` after an issue/renew call."""

    # Value of ``connector_key`` from the lease issue response.
    connector_key: str
    # Parsed ``lease_expires_at`` from the response; None when absent or unparseable.
    lease_expires_at: Optional[datetime]
    # Lease time-to-live in seconds as stored by the client.
    lease_ttl_seconds: int
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class HostedPoolLeaseClient:
    """Issue, renew and revoke a hosted-pool connector lease over HTTP.

    The HTTP base URL is derived from the control-plane websocket URL, and the
    endpoint paths, pool group and token audience come from ``settings``. Every
    request carries a bearer identity token obtained from the metadata server.
    The most recent lease is kept on ``self.lease`` (``None`` when there is no
    active lease).
    """

    def __init__(self, *, control_plane_ws_url: str) -> None:
        """Configure the client.

        Args:
            control_plane_ws_url: ``ws://`` or ``wss://`` URL of the control
                plane; its scheme and host determine the HTTP base URL.

        Raises:
            ValueError: If the websocket URL has an unsupported scheme or no host.
        """
        self.control_plane_ws_url = control_plane_ws_url
        self.control_plane_http_base = _control_plane_http_base(control_plane_ws_url)
        self.issue_path = str(settings.hosted_pool_lease_issue_path or "/v1/system/pool-connectors/lease/issue")
        self.renew_path = str(settings.hosted_pool_lease_renew_path or "/v1/system/pool-connectors/lease/renew")
        self.revoke_path = str(settings.hosted_pool_lease_revoke_path or "/v1/system/pool-connectors/lease/revoke")
        self.pool_group = str(settings.hosted_pool_lease_pool_group or "default")
        # Instance identity is read from the process environment.
        self.instance_id = str(os.getenv("HOSTNAME") or "unknown-instance").strip()
        self.service_name = str(os.getenv("K_SERVICE") or "unknown-service").strip()
        self.revision = str(os.getenv("K_REVISION") or "").strip() or None
        self.lease: Optional[HostedPoolLease] = None

    def _build_url(self, path: str) -> str:
        """Join ``path`` onto the HTTP base with exactly one leading slash."""
        normalized = f"/{str(path or '').lstrip('/')}"
        return f"{self.control_plane_http_base}{normalized}"

    def _audience(self) -> str:
        """Return the configured token audience, defaulting to the HTTP base URL."""
        configured = str(settings.hosted_pool_lease_audience or "").strip()
        return configured or self.control_plane_http_base

    def _identity_token(self) -> str:
        """Fetch an identity token for this client's audience (blocking call)."""
        return _metadata_identity_token(self._audience())

    async def _post(self, path: str, payload: Dict[str, Any]) -> Any:
        """POST ``payload`` as JSON to ``path`` with a fresh bearer token.

        The blocking token fetch runs in a worker thread. Returns the response
        object after ``raise_for_status`` has succeeded.
        """
        url = self._build_url(path)
        token = await asyncio.to_thread(self._identity_token)
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(url, headers={"Authorization": f"Bearer {token}"}, json=payload)
            resp.raise_for_status()
            return resp

    async def issue(self) -> HostedPoolLease:
        """Request a new lease and store it on ``self.lease``.

        Returns:
            The newly issued lease. The TTL defaults to 300 seconds when the
            response omits it and is never stored below 30 seconds.

        Raises:
            RuntimeError: If the response has no ``connector_key``.
            Exception: Token or HTTP failures propagate unchanged.
        """
        payload: Dict[str, Any] = {
            "service_name": self.service_name,
            "revision": self.revision,
            "instance_id": self.instance_id,
            "pool_group": self.pool_group,
        }
        resp = await self._post(self.issue_path, payload)
        body = resp.json()
        connector_key = str(body.get("connector_key") or "").strip()
        if not connector_key:
            raise RuntimeError("Lease issue response missing connector_key")
        ttl = int(body.get("lease_ttl_seconds") or 300)
        lease = HostedPoolLease(
            connector_key=connector_key,
            lease_expires_at=_parse_iso_datetime(body.get("lease_expires_at")),
            lease_ttl_seconds=max(30, ttl),
        )
        self.lease = lease
        return lease

    async def renew(self) -> HostedPoolLease:
        """Renew the current lease, or issue one when there is none.

        Returns:
            The refreshed lease; the connector key is carried over and the TTL
            falls back to the previous value when the response omits it.

        Raises:
            Exception: Token or HTTP failures propagate unchanged.
        """
        if not self.lease:
            return await self.issue()
        payload = {
            "connector_key": self.lease.connector_key,
            "service_name": self.service_name,
            "revision": self.revision,
            "instance_id": self.instance_id,
        }
        resp = await self._post(self.renew_path, payload)
        body = resp.json()
        ttl = int(body.get("lease_ttl_seconds") or self.lease.lease_ttl_seconds)
        self.lease = HostedPoolLease(
            connector_key=self.lease.connector_key,
            lease_expires_at=_parse_iso_datetime(body.get("lease_expires_at")),
            lease_ttl_seconds=max(30, ttl),
        )
        return self.lease

    async def revoke(self) -> None:
        """Best-effort revoke of the current lease; always clears ``self.lease``.

        Does nothing when there is no lease. Any failure, including a failure
        to obtain the identity token, is logged as a warning instead of being
        raised, so the local lease is forgotten regardless of the outcome.
        """
        if not self.lease:
            return
        payload = {
            "connector_key": self.lease.connector_key,
            "service_name": self.service_name,
            "instance_id": self.instance_id,
        }
        try:
            # The token fetch happens inside _post, i.e. inside this try, so a
            # metadata-server error cannot escape or leave the lease set.
            await self._post(self.revoke_path, payload)
        except Exception as exc:  # noqa: BLE001
            logger.warning("Hosted pool lease revoke failed: %s", exc)
        finally:
            self.lease = None
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Ingestion layer for Topos."""
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
"""Checkpoint storage abstractions."""
|
|
2
|
+
|
|
3
|
+
from .checkpoint_store import CheckpointStore, IngestionCheckpoint
|
|
4
|
+
from .sqlite_checkpoint_store import SqliteCheckpointStore, ensure_table
|
|
5
|
+
|
|
6
|
+
__all__ = ["CheckpointStore", "IngestionCheckpoint", "SqliteCheckpointStore", "ensure_table"]
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Checkpoint store contract for ingestion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Dict, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True)
class IngestionCheckpoint:
    """Immutable checkpoint record for one (dataset, schema) pair."""

    # Together these two fields identify the checkpoint.
    dataset_id: str
    schema_id: str
    # Identifier of the last record covered by this checkpoint.
    last_record_id: str
    # Free-form string key/value data stored alongside the checkpoint.
    metadata: Dict[str, str]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CheckpointStore:
    """Interface for saving and loading ingestion checkpoints.

    Both methods raise ``NotImplementedError`` here; subclasses provide the
    actual storage.
    """

    def get_checkpoint(self, dataset_id: str, schema_id: str) -> Optional[IngestionCheckpoint]:
        """Return the checkpoint stored for ``dataset_id`` / ``schema_id``, if any."""
        raise NotImplementedError()

    def save_checkpoint(self, checkpoint: IngestionCheckpoint) -> None:
        """Persist ``checkpoint``."""
        raise NotImplementedError()
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""SQLite-backed checkpoint store for ingestion (e.g. iMessage/Signal sync)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from .checkpoint_store import CheckpointStore, IngestionCheckpoint
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger("topos.ingestion.checkpoints.sqlite")
|
|
12
|
+
|
|
13
|
+
TABLE = "ingestion_checkpoints"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def ensure_table(conn) -> None:
    """Create the checkpoint table on ``conn`` if it is missing, then commit.

    The table is keyed by ``(dataset_id, schema_id)`` and ``updated_at``
    defaults to the database's current time.
    """
    create_sql = f"""
        CREATE TABLE IF NOT EXISTS {TABLE} (
            dataset_id TEXT NOT NULL,
            schema_id TEXT NOT NULL,
            last_record_id TEXT NOT NULL,
            metadata_json TEXT,
            updated_at TEXT DEFAULT (datetime('now')),
            PRIMARY KEY (dataset_id, schema_id)
        )
    """
    conn.execute(create_sql)
    conn.commit()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class SqliteCheckpointStore(CheckpointStore):
    """Persist checkpoints to SQLite (same DB as app).

    Both operations are best-effort: with no connection they do nothing, and
    database errors are logged as warnings rather than raised.
    """

    def __init__(self, conn) -> None:
        """Keep ``conn`` and create the checkpoint table when a connection is given."""
        self.conn = conn
        if conn:
            ensure_table(conn)

    @staticmethod
    def _decode_metadata(metadata_json) -> dict:
        """Decode stored metadata, falling back to ``{}`` for anything unusable.

        Valid JSON that is not an object (``null``, a list, a scalar) is
        rejected as well, so callers always receive a dict as declared on
        ``IngestionCheckpoint.metadata``.
        """
        if not metadata_json:
            return {}
        try:
            decoded = json.loads(metadata_json)
        except (TypeError, json.JSONDecodeError):
            return {}
        return decoded if isinstance(decoded, dict) else {}

    def get_checkpoint(self, dataset_id: str, schema_id: str) -> Optional[IngestionCheckpoint]:
        """Load the checkpoint for ``dataset_id`` / ``schema_id``.

        Returns:
            The stored checkpoint, or ``None`` when there is no connection, no
            matching row, or the lookup fails. An empty ``last_record_id`` is
            reported as ``"0"``.
        """
        if not self.conn:
            return None
        try:
            ensure_table(self.conn)
            row = self.conn.execute(
                f"SELECT last_record_id, metadata_json FROM {TABLE} WHERE dataset_id = ? AND schema_id = ?",
                (dataset_id, schema_id),
            ).fetchone()
            if not row:
                return None
            last_record_id, metadata_json = row
            return IngestionCheckpoint(
                dataset_id=dataset_id,
                schema_id=schema_id,
                last_record_id=last_record_id or "0",
                metadata=self._decode_metadata(metadata_json),
            )
        except Exception as e:
            logger.warning("get_checkpoint failed: %s", e)
            return None

    def save_checkpoint(self, checkpoint: IngestionCheckpoint) -> None:
        """Insert or replace the row for ``checkpoint`` and commit.

        Empty metadata is stored as NULL. Failures are logged as warnings and
        not raised.
        """
        if not self.conn:
            return
        try:
            ensure_table(self.conn)
            metadata_json = json.dumps(checkpoint.metadata, ensure_ascii=False) if checkpoint.metadata else None
            self.conn.execute(
                f"""
                INSERT OR REPLACE INTO {TABLE} (dataset_id, schema_id, last_record_id, metadata_json, updated_at)
                VALUES (?, ?, ?, ?, datetime('now'))
                """,
                (checkpoint.dataset_id, checkpoint.schema_id, checkpoint.last_record_id, metadata_json),
            )
            self.conn.commit()
        except Exception as e:
            logger.warning("save_checkpoint failed: %s", e)
|