topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""Progress bar utility for enrichment jobs.
|
|
2
|
+
|
|
3
|
+
Provides a single-line, in-place updating progress bar similar to the one in the transformers library.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
import time
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ProgressBar:
    """Single-line progress bar that updates in place.

    When the target stream is a TTY, the bar is redrawn on a single line using
    carriage returns. On any other stream a short status is written only when
    ``n`` is a multiple of ``total // 10`` (at least 1) or equals ``total``.

    Can be used as a context manager: entering draws the initial state and
    exiting calls :meth:`close`. ``close()`` is idempotent, so the extra call
    made by ``__del__`` after a ``with`` block no longer prints a second
    newline.
    """

    def __init__(
        self,
        total: int,
        desc: str = "",
        width: int = 50,
        file: Optional[object] = None,
    ):
        """Initialize progress bar.

        Args:
            total: Total number of items to process
            desc: Description prefix for the progress bar
            width: Width of the progress bar in characters
            file: File object to write to (defaults to stderr)
        """
        self.total = total
        self.desc = desc
        self.width = width
        self.file = file or sys.stderr
        self.n = 0
        self.start_time = time.time()
        # Only assigned here; nothing in this class updates it afterwards.
        self.last_update_time = self.start_time
        self._isatty = hasattr(self.file, 'isatty') and self.file.isatty()
        self._last_line_length = 0
        # True once close() has run and nothing has been drawn since.
        self._closed = False

    def update(self, n: int = 1) -> None:
        """Update progress by n items.

        The counter is clamped so it never exceeds ``total``.

        Args:
            n: Number of items to advance (default 1)
        """
        self.n = min(self.n + n, self.total)
        self._display()

    def set_description(self, desc: str) -> None:
        """Update the description prefix.

        The new description is used from the next redraw on; this method does
        not redraw by itself.

        Args:
            desc: New description
        """
        self.desc = desc

    def _display(self) -> None:
        """Display/update the progress bar."""
        # The bar is (potentially) being drawn again, so a later close() must
        # emit its trailing newline even if close() already ran once.
        self._closed = False

        elapsed = time.time() - self.start_time
        percent = (self.n / self.total * 100) if self.total > 0 else 0

        if not self._isatty:
            # If not a TTY, just print periodic updates
            if self.n % max(1, self.total // 10) == 0 or self.n == self.total:
                print(
                    f"\r{self.desc}: {self.n}/{self.total} ({percent:.1f}%) "
                    f"[{elapsed:.1f}s]",
                    end="",
                    file=self.file,
                    flush=True,
                )
            return

        # ETA is only meaningful once some items are done and some remain.
        eta_str = ""
        if elapsed > 0 and 0 < self.n < self.total:
            rate = self.n / elapsed
            eta = (self.total - self.n) / rate
            eta_str = f", ETA: {eta:.1f}s"

        # Build progress bar
        filled = int(self.width * self.n / self.total) if self.total > 0 else 0
        bar = "█" * filled + "░" * (self.width - filled)

        # Build status string
        status = f"{self.desc}: {percent:5.1f}%|{bar}| {self.n}/{self.total} [{elapsed:.1f}s{eta_str}]"

        # Blank out the previously drawn line (it may have been longer than the
        # new one), return to column 0 and draw the new status.
        print(f"\r{' ' * self._last_line_length}\r{status}", end="", file=self.file, flush=True)
        self._last_line_length = len(status)

    def close(self) -> None:
        """Close the progress bar (print final newline).

        Safe to call repeatedly: the newline is written at most once per
        drawing session, so ``__exit__`` followed by ``__del__`` does not
        leave a blank line behind.
        """
        if getattr(self, "_closed", False):
            return
        self._closed = True
        if self._isatty:
            print(file=self.file)  # Newline to move past progress bar
        self._last_line_length = 0

    def __enter__(self):
        """Context manager entry: draw the initial state and return self."""
        self._display()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: close the bar; exceptions are not suppressed."""
        self.close()

    def __del__(self):
        """Cleanup on deletion."""
        try:
            self.close()
        except Exception:
            # Best effort only: the stream may already be closed, or the
            # instance may be half-initialized if __init__ raised.
            pass
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
URL classification for browser visits. Thin wrapper over the Engine (Sprint 03).
|
|
3
|
+
|
|
4
|
+
Prefer building a ProcessingTask and calling Engine.run() directly. This module
|
|
5
|
+
remains for backward compatibility with any code that still calls classify_url().
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, Dict, Optional
|
|
11
|
+
|
|
12
|
+
from ..engine import Engine, build_url_classification_task
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def classify_url(url: str, title: Optional[str] = None) -> Dict[str, Any]:
    """
    Classify a URL (and optional title) by running a task through the Engine.

    Args:
        url: URL to classify. Must be a string with non-whitespace content;
            surrounding whitespace is stripped before the task is built.
        title: Optional page title, forwarded to the task unchanged.

    Returns:
        The ``output`` of the Engine result.

    Raises:
        ValueError: If ``url`` is not a string or is blank.
        RuntimeError: If the Engine result status is not ``"completed"``. The
            message is the result's ``error`` when truthy, otherwise a generic
            message naming the status.
    """
    cleaned_url = url.strip() if isinstance(url, str) else ""
    if not cleaned_url:
        raise ValueError("url must be a non-empty string")

    classification_task = build_url_classification_task(
        task_id="website_classifier",
        url=cleaned_url,
        title=title,
    )
    outcome = Engine().run(classification_task)
    if outcome.status == "completed":
        return outcome.output
    raise RuntimeError(outcome.error or f"Engine returned status {outcome.status}")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Filter Lab: eval job groups, preset bundles, Ollama compare (SQLite + API)."""
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
"""Versioned preset bundles for Filter Lab (synthetic text only)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
from topos.config.sanitization_ollama import SANITIZATION_OLLAMA_TRANSFORM_IDS
|
|
8
|
+
|
|
9
|
+
# Primary text field key in bundle records (the key record_text() reads).
TEXT_FIELD = "body"

# Category value that is_bundle_compatible_with_filter() looks for in a
# bundle's "compatible_filter_categories" list.
SANITIZATION_CATEGORY = "sanitization"

# How strongly a bundle matches an Ollama transform (all remain runnable in Lab).
# These are the only values _full_filter_fit() accepts as overrides.
FIT_RECOMMENDED = "recommended"  # strong, primary eval signal for this transform
FIT_SUPPORTED = "supported"  # valid run; weaker or generic outcome expected
FIT_STRESS = "stress"  # edge / policy stress — useful but easy to misread
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _full_filter_fit(overrides: Dict[str, str]) -> Dict[str, str]:
    """Return a fit level for every known sanitization transform id.

    Each id in ``SANITIZATION_OLLAMA_TRANSFORM_IDS`` starts at
    ``FIT_SUPPORTED``. An entry in *overrides* replaces that default only when
    its key is one of those ids and its value is one of the three ``FIT_*``
    levels; any other entry is silently ignored.
    """
    allowed_levels = (FIT_RECOMMENDED, FIT_SUPPORTED, FIT_STRESS)
    fit_by_transform: Dict[str, str] = dict.fromkeys(
        SANITIZATION_OLLAMA_TRANSFORM_IDS, FIT_SUPPORTED
    )
    accepted = {
        transform_id: level
        for transform_id, level in overrides.items()
        if transform_id in fit_by_transform and level in allowed_levels
    }
    fit_by_transform.update(accepted)
    return fit_by_transform
|
|
26
|
+
|
|
27
|
+
# Cap per-record body size returned by GET /bundles/{id} (preview API).
# get_bundle_preview() cuts longer bodies to this many characters and then
# appends a "\n[TRUNCATED]" marker.
_BUNDLE_PREVIEW_BODY_MAX_LEN = 8000
|
|
29
|
+
|
|
30
|
+
_BUNDLES: List[Dict[str, Any]] = [
|
|
31
|
+
{
|
|
32
|
+
"id": "lab.messages.casual",
|
|
33
|
+
"label": "Casual messages",
|
|
34
|
+
"description": "Short chat-style snippets (synthetic).",
|
|
35
|
+
"bundle_version": "2",
|
|
36
|
+
"max_input_chars": 12000,
|
|
37
|
+
"compatible_filter_categories": [SANITIZATION_CATEGORY],
|
|
38
|
+
"disclaimer_nsfw": False,
|
|
39
|
+
"filter_fit": _full_filter_fit(
|
|
40
|
+
{
|
|
41
|
+
"raw_to_summary": FIT_RECOMMENDED,
|
|
42
|
+
"raw_to_sentiment": FIT_RECOMMENDED,
|
|
43
|
+
"third_party_anonymization": FIT_SUPPORTED,
|
|
44
|
+
"name_removal": FIT_SUPPORTED,
|
|
45
|
+
"contact_removal": FIT_SUPPORTED,
|
|
46
|
+
"pii_redaction": FIT_STRESS,
|
|
47
|
+
"nsfw_sanitization": FIT_STRESS,
|
|
48
|
+
}
|
|
49
|
+
),
|
|
50
|
+
"records": [
|
|
51
|
+
{"id": "m1", "body": "hey are we still on for coffee at 3?"},
|
|
52
|
+
{"id": "m2", "body": "lol ok just ping me when you're close"},
|
|
53
|
+
{"id": "m3", "body": "running 10 min late sorry!!"},
|
|
54
|
+
{"id": "m4", "body": "did you see the doc I shared in the channel?"},
|
|
55
|
+
{"id": "m5", "body": "sounds good, let's sync tomorrow morning"},
|
|
56
|
+
{"id": "m6", "body": "ugh this meeting could've been an email"},
|
|
57
|
+
{"id": "m7", "body": "bring snacks if you can — no pressure"},
|
|
58
|
+
{"id": "m8", "body": "haha yeah that's exactly what I meant"},
|
|
59
|
+
{"id": "m9", "body": "call me if anything blocks you"},
|
|
60
|
+
{"id": "m10", "body": "order pizza without me if I'm not there by 7"},
|
|
61
|
+
],
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
"id": "lab.pii.synthetic",
|
|
65
|
+
"label": "Synthetic PII stress",
|
|
66
|
+
"description": "Fake names, emails, phones for redaction eval (not real people).",
|
|
67
|
+
"bundle_version": "2",
|
|
68
|
+
"max_input_chars": 12000,
|
|
69
|
+
"compatible_filter_categories": [SANITIZATION_CATEGORY],
|
|
70
|
+
"disclaimer_nsfw": False,
|
|
71
|
+
"filter_fit": _full_filter_fit(
|
|
72
|
+
{
|
|
73
|
+
"pii_redaction": FIT_RECOMMENDED,
|
|
74
|
+
"contact_removal": FIT_RECOMMENDED,
|
|
75
|
+
"name_removal": FIT_RECOMMENDED,
|
|
76
|
+
"third_party_anonymization": FIT_RECOMMENDED,
|
|
77
|
+
"raw_to_summary": FIT_SUPPORTED,
|
|
78
|
+
"raw_to_sentiment": FIT_SUPPORTED,
|
|
79
|
+
"nsfw_sanitization": FIT_STRESS,
|
|
80
|
+
}
|
|
81
|
+
),
|
|
82
|
+
"records": [
|
|
83
|
+
{
|
|
84
|
+
"id": "p1",
|
|
85
|
+
"body": "Contact Jane Doe at jane.doe@example.com or +1-555-0100. Address: 123 Fake St, Springfield.",
|
|
86
|
+
},
|
|
87
|
+
{"id": "p2", "body": "Invoice #9921 for Acme Corp; remit to payments@acmecorp.example."},
|
|
88
|
+
{
|
|
89
|
+
"id": "p3",
|
|
90
|
+
"body": "Reach Priya Sharma at priya.sharma@fake-lab.example or WhatsApp +44 7700 900123.",
|
|
91
|
+
},
|
|
92
|
+
{
|
|
93
|
+
"id": "p4",
|
|
94
|
+
"body": "Ship to: 742 Evergreen Terrace, Springfield, IL 62704 — attn: Homer S.",
|
|
95
|
+
},
|
|
96
|
+
{
|
|
97
|
+
"id": "p5",
|
|
98
|
+
"body": "Patient MRN 883921; DOB 01/15/1980; emergency cousin.jane@hospital.test",
|
|
99
|
+
},
|
|
100
|
+
{
|
|
101
|
+
"id": "p6",
|
|
102
|
+
"body": "Wire transfer: routing 021000021, account ending 4521 (holder: John Q Public, Lab-Only).",
|
|
103
|
+
},
|
|
104
|
+
{
|
|
105
|
+
"id": "p7",
|
|
106
|
+
"body": "Driver license CA D1234567 — name: FAKEPERSON ONLYFORLAB, expires 2030-01-01.",
|
|
107
|
+
},
|
|
108
|
+
{"id": "p8", "body": "Tweet @totally_fake_handle DM for collab; backup email backup@social.test."},
|
|
109
|
+
{
|
|
110
|
+
"id": "p9",
|
|
111
|
+
"body": "Beneficiary: Robert Tables; robert.tables@sql-injection.test; desk phone +1-555-0199 ext 42.",
|
|
112
|
+
},
|
|
113
|
+
{
|
|
114
|
+
"id": "p10",
|
|
115
|
+
"body": "Join call: https://totally-fake-meeting.example/join?id=abc123 — passcode: secret123",
|
|
116
|
+
},
|
|
117
|
+
],
|
|
118
|
+
},
|
|
119
|
+
{
|
|
120
|
+
"id": "lab.safety.edge_cases",
|
|
121
|
+
"label": "Safety / edge phrasing (educational)",
|
|
122
|
+
"description": "Borderline phrasing for policy-style filters; synthetic.",
|
|
123
|
+
"bundle_version": "2",
|
|
124
|
+
"max_input_chars": 12000,
|
|
125
|
+
"compatible_filter_categories": [SANITIZATION_CATEGORY],
|
|
126
|
+
"disclaimer_nsfw": True,
|
|
127
|
+
"filter_fit": _full_filter_fit(
|
|
128
|
+
{
|
|
129
|
+
"nsfw_sanitization": FIT_RECOMMENDED,
|
|
130
|
+
"raw_to_summary": FIT_SUPPORTED,
|
|
131
|
+
"raw_to_sentiment": FIT_SUPPORTED,
|
|
132
|
+
"pii_redaction": FIT_STRESS,
|
|
133
|
+
"third_party_anonymization": FIT_STRESS,
|
|
134
|
+
"name_removal": FIT_STRESS,
|
|
135
|
+
"contact_removal": FIT_STRESS,
|
|
136
|
+
}
|
|
137
|
+
),
|
|
138
|
+
"records": [
|
|
139
|
+
{
|
|
140
|
+
"id": "s1",
|
|
141
|
+
"body": "This is a clinical description of anatomy for a biology exam.",
|
|
142
|
+
},
|
|
143
|
+
{"id": "s2", "body": "Report uses metaphorical language; no actionable harm."},
|
|
144
|
+
{
|
|
145
|
+
"id": "s3",
|
|
146
|
+
"body": "Politician used violent metaphor about 'crushing' opponents in debate.",
|
|
147
|
+
},
|
|
148
|
+
{
|
|
149
|
+
"id": "s4",
|
|
150
|
+
"body": "Historical fiction battle scene: strategic overview only, no gore.",
|
|
151
|
+
},
|
|
152
|
+
{
|
|
153
|
+
"id": "s5",
|
|
154
|
+
"body": "Harm-reduction pamphlet excerpt: factual risks of overdose, non-promotional (lab text).",
|
|
155
|
+
},
|
|
156
|
+
{
|
|
157
|
+
"id": "s6",
|
|
158
|
+
"body": "Crisis copy: if you are in crisis, contact a local hotline; 988-123-1234.",
|
|
159
|
+
},
|
|
160
|
+
{
|
|
161
|
+
"id": "s7",
|
|
162
|
+
"body": "Self-defense class outline: legal thresholds for use of force.",
|
|
163
|
+
},
|
|
164
|
+
{
|
|
165
|
+
"id": "s8",
|
|
166
|
+
"body": "Patient denies SI/HI; no acute psychosis documented.",
|
|
167
|
+
},
|
|
168
|
+
{
|
|
169
|
+
"id": "s9",
|
|
170
|
+
"body": "Why some titles earn a Mature label.",
|
|
171
|
+
},
|
|
172
|
+
{"id": "s10", "body": "How to fillet a fish safely with your knife and keep the blood to a minimum."},
|
|
173
|
+
],
|
|
174
|
+
},
|
|
175
|
+
{
|
|
176
|
+
"id": "lab.nsfw.synthetic",
|
|
177
|
+
"label": "Synthetic NSFW & profanity stress",
|
|
178
|
+
"description": "Synthetic sexual references and strong language for nsfw_sanitization eval only.",
|
|
179
|
+
"bundle_version": "1",
|
|
180
|
+
"max_input_chars": 12000,
|
|
181
|
+
"compatible_filter_categories": [SANITIZATION_CATEGORY],
|
|
182
|
+
"disclaimer_nsfw": True,
|
|
183
|
+
"filter_fit": _full_filter_fit(
|
|
184
|
+
{
|
|
185
|
+
"nsfw_sanitization": FIT_RECOMMENDED,
|
|
186
|
+
"raw_to_summary": FIT_SUPPORTED,
|
|
187
|
+
"raw_to_sentiment": FIT_SUPPORTED,
|
|
188
|
+
"name_removal": FIT_SUPPORTED,
|
|
189
|
+
"third_party_anonymization": FIT_SUPPORTED,
|
|
190
|
+
"pii_redaction": FIT_STRESS,
|
|
191
|
+
"contact_removal": FIT_STRESS,
|
|
192
|
+
}
|
|
193
|
+
),
|
|
194
|
+
"records": [
|
|
195
|
+
{"id": "n1", "body": "This fucking deploy script failed again and I'm sick of it."},
|
|
196
|
+
{"id": "n2", "body": "What the hell — did anyone even test this shit before shipping?"},
|
|
197
|
+
{
|
|
198
|
+
"id": "n3",
|
|
199
|
+
"body": "After the party we hooked up; awkward the next day but we laughed it off.",
|
|
200
|
+
},
|
|
201
|
+
{
|
|
202
|
+
"id": "n4",
|
|
203
|
+
"body": "She sent an explicit photo; I deleted it and said not to do that on a work thread.",
|
|
204
|
+
},
|
|
205
|
+
{
|
|
206
|
+
"id": "n5",
|
|
207
|
+
"body": "Romance draft: they kissed, clothes came off, his dick throbbed, her pussy dripped, scene fades.",
|
|
208
|
+
},
|
|
209
|
+
{
|
|
210
|
+
"id": "n6",
|
|
211
|
+
"body": "Porn pop-ups hijack the lab browser; IT should block that crap on shared machines.",
|
|
212
|
+
},
|
|
213
|
+
{"id": "n7", "body": "He called his boss a dick in standup; HR opened a conduct review."},
|
|
214
|
+
{
|
|
215
|
+
"id": "n8",
|
|
216
|
+
"body": "want u so bad tonight, can't wait to fuck you later",
|
|
217
|
+
},
|
|
218
|
+
{
|
|
219
|
+
"id": "n9",
|
|
220
|
+
"body": "Your service is bullshit and I'm charging back every goddamn cent.",
|
|
221
|
+
},
|
|
222
|
+
{
|
|
223
|
+
"id": "n10",
|
|
224
|
+
"body": "DTF tonight",
|
|
225
|
+
},
|
|
226
|
+
],
|
|
227
|
+
},
|
|
228
|
+
]
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def list_bundle_metadata() -> List[Dict[str, Any]]:
    """Return summary metadata for every preset bundle (for GET /bundles).

    Record bodies are not included; each entry carries ``record_count``
    instead. The container values (``compatible_filter_categories`` and
    ``filter_fit``) are copies, so mutating the returned data cannot alter the
    module-level bundle definitions. This matches ``get_bundle_preview``.

    Returns:
        One dict per bundle, in definition order.
    """
    return [
        {
            "id": b["id"],
            "label": b["label"],
            "description": b.get("description"),
            "bundle_version": b["bundle_version"],
            "max_input_chars": b["max_input_chars"],
            # Copy: the original handed out the shared module-level list.
            "compatible_filter_categories": list(b.get("compatible_filter_categories") or []),
            "disclaimer_nsfw": bool(b.get("disclaimer_nsfw")),
            "record_count": len(b.get("records") or []),
            "filter_fit": dict(b.get("filter_fit") or {}),
        }
        for b in _BUNDLES
    ]
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def get_bundle(bundle_id: str) -> Optional[Dict[str, Any]]:
    """Look up a preset bundle by its ``id``.

    Returns the stored bundle dict itself (not a copy) for the first bundle
    whose ``"id"`` equals *bundle_id*, or ``None`` when there is no match.
    """
    matching = (candidate for candidate in _BUNDLES if candidate["id"] == bundle_id)
    return next(matching, None)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def get_bundle_preview(bundle_id: str) -> Optional[Dict[str, Any]]:
    """Build the JSON payload for GET /v1/filter-lab/bundles/{bundle_id}.

    Returns ``None`` when the bundle is unknown. Otherwise returns the bundle
    metadata plus its records as ``{"id", "body"}`` dicts. Records without an
    ``id`` are skipped, ids are converted to ``str``, and bodies longer than
    ``_BUNDLE_PREVIEW_BODY_MAX_LEN`` are cut and suffixed with
    ``"\\n[TRUNCATED]"``.
    """
    bundle = get_bundle(bundle_id)
    if not bundle:
        return None

    def _preview_body(record: Dict[str, Any]) -> str:
        # Keep short bodies as they are; mark the ones that had to be cut.
        text = record_text(record)
        if len(text) <= _BUNDLE_PREVIEW_BODY_MAX_LEN:
            return text
        return text[:_BUNDLE_PREVIEW_BODY_MAX_LEN] + "\n[TRUNCATED]"

    preview_records: List[Dict[str, str]] = [
        {"id": str(rec.get("id")), "body": _preview_body(rec)}
        for rec in bundle.get("records") or []
        if rec.get("id") is not None
    ]

    payload: Dict[str, Any] = {
        "id": bundle["id"],
        "label": bundle["label"],
        "description": bundle.get("description"),
        "bundle_version": bundle["bundle_version"],
        "max_input_chars": bundle["max_input_chars"],
        "compatible_filter_categories": list(bundle.get("compatible_filter_categories") or []),
        "disclaimer_nsfw": bool(bundle.get("disclaimer_nsfw")),
        "filter_fit": dict(bundle.get("filter_fit") or {}),
        "records": preview_records,
    }
    return payload
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def record_text(record: Dict[str, Any]) -> str:
    """Return the record's ``TEXT_FIELD`` value, or ``""`` if it is missing or not a ``str``."""
    value = record.get(TEXT_FIELD)
    return value if isinstance(value, str) else ""
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def bundle_record_ids(bundle: Dict[str, Any]) -> List[str]:
    """Return the ids of the bundle's records as strings, in record order.

    Records whose ``id`` is missing or ``None`` are skipped; a missing or
    empty ``"records"`` entry yields an empty list.
    """
    collected: List[str] = []
    for entry in bundle.get("records") or []:
        entry_id = entry.get("id")
        if entry_id is None:
            continue
        collected.append(str(entry_id))
    return collected
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def is_bundle_compatible_with_filter(bundle: Dict[str, Any], filter_id: str) -> bool:
    """Tell whether *bundle* may be run with the transform *filter_id*.

    True only when *filter_id* is one of ``SANITIZATION_OLLAMA_TRANSFORM_IDS``
    and the bundle's ``"compatible_filter_categories"`` contains
    ``SANITIZATION_CATEGORY``.
    """
    is_known_transform = filter_id in SANITIZATION_OLLAMA_TRANSFORM_IDS
    return is_known_transform and SANITIZATION_CATEGORY in (
        bundle.get("compatible_filter_categories") or []
    )
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""SQLite DDL for Filter Lab tables (engine-owned)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import sqlite3
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger("topos.filter_lab.schema")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def ensure_filter_lab_schema(conn: sqlite3.Connection) -> None:
    """Create the Filter Lab tables and indices on *conn* if they are missing.

    Enables foreign keys, creates ``filter_lab_job_group``, ``filter_lab_run``
    and ``filter_lab_model_event`` plus two indices, then adds the
    ``options_json`` column to ``filter_lab_job_group`` when an existing table
    lacks it, and finally commits.

    Raises:
        Exception: Any error from the statements above is logged at warning
            level and re-raised unchanged.
    """
    try:
        setup_statements = (
            "PRAGMA foreign_keys = ON",
            """
            CREATE TABLE IF NOT EXISTS filter_lab_job_group (
                id TEXT PRIMARY KEY,
                created_at TEXT NOT NULL,
                filter_id TEXT NOT NULL,
                bundle_id TEXT NOT NULL,
                bundle_version TEXT NOT NULL,
                status TEXT NOT NULL,
                baseline_models_json TEXT NOT NULL DEFAULT '[]',
                pulled_models_json TEXT NOT NULL DEFAULT '[]',
                notes TEXT,
                preferred_model_tag TEXT,
                group_notes TEXT,
                options_json TEXT NOT NULL DEFAULT '{}'
            )
            """,
            """
            CREATE TABLE IF NOT EXISTS filter_lab_run (
                id TEXT PRIMARY KEY,
                group_id TEXT NOT NULL,
                model_tag TEXT NOT NULL,
                record_id TEXT NOT NULL,
                status TEXT NOT NULL,
                started_at TEXT,
                finished_at TEXT,
                latency_ms INTEGER,
                error_code TEXT,
                input_hash TEXT,
                input_text TEXT,
                output_text TEXT,
                metrics_json TEXT,
                user_quality_score_0_10 INTEGER,
                user_liked INTEGER,
                user_note TEXT,
                rated_at TEXT,
                FOREIGN KEY (group_id) REFERENCES filter_lab_job_group(id) ON DELETE CASCADE
            )
            """,
            """
            CREATE TABLE IF NOT EXISTS filter_lab_model_event (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                group_id TEXT NOT NULL,
                event_type TEXT NOT NULL,
                model_tag TEXT NOT NULL,
                created_at TEXT NOT NULL,
                FOREIGN KEY (group_id) REFERENCES filter_lab_job_group(id) ON DELETE CASCADE
            )
            """,
            "CREATE INDEX IF NOT EXISTS idx_filter_lab_job_group_filter_created ON filter_lab_job_group (filter_id, created_at DESC)",
            "CREATE INDEX IF NOT EXISTS idx_filter_lab_run_group ON filter_lab_run (group_id)",
        )
        for statement in setup_statements:
            conn.execute(statement)

        # Databases created before options_json existed need the column added;
        # PRAGMA table_info rows carry the column name at index 1.
        table_info = conn.execute("PRAGMA table_info(filter_lab_job_group)").fetchall()
        existing_columns = [row[1] for row in table_info]
        if existing_columns and "options_json" not in existing_columns:
            conn.execute(
                "ALTER TABLE filter_lab_job_group ADD COLUMN options_json TEXT NOT NULL DEFAULT '{}'"
            )
            logger.info(
                "filter_lab schema migrated: filter_lab_job_group.options_json added"
            )
        conn.commit()
    except Exception as exc:  # noqa: BLE001
        logger.warning("ensure_filter_lab_schema failed: %s", exc)
        raise
|