topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,122 @@
1
+ """Progress bar utility for enrichment jobs.
2
+
3
+ Provides a single-line updating progress bar similar to transformers library.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import sys
9
+ import time
10
+ from typing import Optional
11
+
12
+
13
class ProgressBar:
    """Single-line progress bar that updates in place.

    When the output stream is a TTY the bar is redrawn on every update using
    a carriage return. On any other stream a short status line is written
    roughly every 10% of ``total`` and when the count reaches ``total``.

    The instance can be used as a context manager; ``close()`` is idempotent,
    so leaving the ``with`` block and later garbage collection emit only one
    trailing newline.
    """

    def __init__(
        self,
        total: int,
        desc: str = "",
        width: int = 50,
        file: Optional[object] = None,
    ):
        """Initialize progress bar.

        Args:
            total: Total number of items to process
            desc: Description prefix for the progress bar
            width: Width of the progress bar in characters
            file: File object to write to (defaults to stderr)
        """
        self.total = total
        self.desc = desc
        self.width = width
        self.file = file or sys.stderr
        self.n = 0
        self.start_time = time.time()
        self.last_update_time = self.start_time
        self._isatty = hasattr(self.file, 'isatty') and self.file.isatty()
        self._last_line_length = 0
        # True once close() has run and nothing has been drawn since; guards
        # against a second newline from __exit__ followed by __del__.
        self._closed = False

    def update(self, n: int = 1) -> None:
        """Update progress by n items.

        The running count is clamped so it never exceeds ``total``.

        Args:
            n: Number of items to advance (default 1)
        """
        self.n = min(self.n + n, self.total)
        self._display()

    def set_description(self, desc: str) -> None:
        """Update the description prefix (takes effect on the next redraw).

        Args:
            desc: New description
        """
        self.desc = desc

    def _display(self) -> None:
        """Display/update the progress bar."""
        if not self._isatty:
            # Not a TTY: only emit a status about every 10% and at the end.
            step = max(1, self.total // 10)
            if self.n % step == 0 or self.n == self.total:
                elapsed = time.time() - self.start_time
                percent = (self.n / self.total * 100) if self.total > 0 else 0
                print(
                    f"\r{self.desc}: {self.n}/{self.total} ({percent:.1f}%) "
                    f"[{elapsed:.1f}s]",
                    end="",
                    file=self.file,
                    flush=True,
                )
            return

        # Calculate progress
        percent = (self.n / self.total * 100) if self.total > 0 else 0
        elapsed = time.time() - self.start_time

        # ETA is only meaningful once some items are done and some remain.
        eta_str = ""
        if elapsed > 0 and 0 < self.n < self.total:
            rate = self.n / elapsed
            eta = (self.total - self.n) / rate
            eta_str = f", ETA: {eta:.1f}s"

        # Build progress bar
        filled = int(self.width * self.n / self.total) if self.total > 0 else 0
        bar = "█" * filled + "░" * (self.width - filled)

        # Build status string
        status = f"{self.desc}: {percent:5.1f}%|{bar}| {self.n}/{self.total} [{elapsed:.1f}s{eta_str}]"

        # Blank out the previous line (it may have been longer), then redraw.
        print(f"\r{' ' * self._last_line_length}\r{status}", end="", file=self.file, flush=True)
        self._last_line_length = len(status)
        # A fresh bar line is on screen, so the next close() must end it.
        self._closed = False

    def close(self) -> None:
        """Close the progress bar (print final newline on a TTY).

        Calling this again without an intervening redraw is a no-op.
        """
        if self._closed:
            return
        self._closed = True
        if self._isatty:
            print(file=self.file)  # Newline to move past progress bar
        self._last_line_length = 0

    def __enter__(self):
        """Context manager entry: draw the initial bar and return self."""
        self._display()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: close the bar; exceptions are not suppressed."""
        self.close()

    def __del__(self):
        """Cleanup on deletion."""
        try:
            self.close()
        except Exception:
            # Deliberate best effort: the instance may be only partially
            # constructed or the stream may already be unusable.
            pass
@@ -0,0 +1,31 @@
1
+ """
2
+ URL classification for browser visits. Thin wrapper over the Engine (Sprint 03).
3
+
4
+ Prefer building a ProcessingTask and calling Engine.run() directly. This module
5
+ remains for backward compatibility with any code that still calls classify_url().
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Dict, Optional
11
+
12
+ from ..engine import Engine, build_url_classification_task
13
+
14
+
15
def classify_url(url: str, title: Optional[str] = None) -> Dict[str, Any]:
    """
    Classify a URL (and optional title) by running a task through the Engine.

    Args:
        url: URL to classify; surrounding whitespace is stripped before use.
        title: Optional page title, forwarded to the task unchanged.

    Returns:
        The ``output`` of the engine result. NOTE(review): the previous
        docstring describes it as a dict with category, confidence and
        model; that shape is not visible from this function.

    Raises:
        ValueError: If ``url`` is not a string or is blank.
        RuntimeError: If the engine result status is not ``"completed"``.
    """
    cleaned_url = url.strip() if isinstance(url, str) else ""
    if not cleaned_url:
        raise ValueError("url must be a non-empty string")

    classification_task = build_url_classification_task(
        task_id="website_classifier",
        url=cleaned_url,
        title=title,
    )
    outcome = Engine().run(classification_task)
    if outcome.status == "completed":
        return outcome.output
    raise RuntimeError(outcome.error or f"Engine returned status {outcome.status}")
@@ -0,0 +1 @@
1
+ """Filter Lab: eval job groups, preset bundles, Ollama compare (SQLite + API)."""
@@ -0,0 +1,300 @@
1
+ """Versioned preset bundles for Filter Lab (synthetic text only)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from topos.config.sanitization_ollama import SANITIZATION_OLLAMA_TRANSFORM_IDS
8
+
9
# Key under which each bundle record stores its text.
TEXT_FIELD = "body"

# Category name listed in a bundle's "compatible_filter_categories".
SANITIZATION_CATEGORY = "sanitization"

# Fit levels: how well a bundle suits a given Ollama transform. Every level
# remains runnable in the Lab; the level only signals how to read the result.
FIT_RECOMMENDED = "recommended"  # primary, strong evaluation signal for the transform
FIT_SUPPORTED = "supported"  # runs fine, but expect a weaker or generic outcome
FIT_STRESS = "stress"  # edge-case / policy stress; useful yet easy to misread
18
+
19
+
20
+ def _full_filter_fit(overrides: Dict[str, str]) -> Dict[str, str]:
21
+ out: Dict[str, str] = {tid: FIT_SUPPORTED for tid in SANITIZATION_OLLAMA_TRANSFORM_IDS}
22
+ for k, v in overrides.items():
23
+ if k in out and v in (FIT_RECOMMENDED, FIT_SUPPORTED, FIT_STRESS):
24
+ out[k] = v
25
+ return out
26
+
27
+ # Cap per-record body size returned by GET /bundles/{id} (preview API).
28
+ _BUNDLE_PREVIEW_BODY_MAX_LEN = 8000
29
+
30
+ _BUNDLES: List[Dict[str, Any]] = [
31
+ {
32
+ "id": "lab.messages.casual",
33
+ "label": "Casual messages",
34
+ "description": "Short chat-style snippets (synthetic).",
35
+ "bundle_version": "2",
36
+ "max_input_chars": 12000,
37
+ "compatible_filter_categories": [SANITIZATION_CATEGORY],
38
+ "disclaimer_nsfw": False,
39
+ "filter_fit": _full_filter_fit(
40
+ {
41
+ "raw_to_summary": FIT_RECOMMENDED,
42
+ "raw_to_sentiment": FIT_RECOMMENDED,
43
+ "third_party_anonymization": FIT_SUPPORTED,
44
+ "name_removal": FIT_SUPPORTED,
45
+ "contact_removal": FIT_SUPPORTED,
46
+ "pii_redaction": FIT_STRESS,
47
+ "nsfw_sanitization": FIT_STRESS,
48
+ }
49
+ ),
50
+ "records": [
51
+ {"id": "m1", "body": "hey are we still on for coffee at 3?"},
52
+ {"id": "m2", "body": "lol ok just ping me when you're close"},
53
+ {"id": "m3", "body": "running 10 min late sorry!!"},
54
+ {"id": "m4", "body": "did you see the doc I shared in the channel?"},
55
+ {"id": "m5", "body": "sounds good, let's sync tomorrow morning"},
56
+ {"id": "m6", "body": "ugh this meeting could've been an email"},
57
+ {"id": "m7", "body": "bring snacks if you can — no pressure"},
58
+ {"id": "m8", "body": "haha yeah that's exactly what I meant"},
59
+ {"id": "m9", "body": "call me if anything blocks you"},
60
+ {"id": "m10", "body": "order pizza without me if I'm not there by 7"},
61
+ ],
62
+ },
63
+ {
64
+ "id": "lab.pii.synthetic",
65
+ "label": "Synthetic PII stress",
66
+ "description": "Fake names, emails, phones for redaction eval (not real people).",
67
+ "bundle_version": "2",
68
+ "max_input_chars": 12000,
69
+ "compatible_filter_categories": [SANITIZATION_CATEGORY],
70
+ "disclaimer_nsfw": False,
71
+ "filter_fit": _full_filter_fit(
72
+ {
73
+ "pii_redaction": FIT_RECOMMENDED,
74
+ "contact_removal": FIT_RECOMMENDED,
75
+ "name_removal": FIT_RECOMMENDED,
76
+ "third_party_anonymization": FIT_RECOMMENDED,
77
+ "raw_to_summary": FIT_SUPPORTED,
78
+ "raw_to_sentiment": FIT_SUPPORTED,
79
+ "nsfw_sanitization": FIT_STRESS,
80
+ }
81
+ ),
82
+ "records": [
83
+ {
84
+ "id": "p1",
85
+ "body": "Contact Jane Doe at jane.doe@example.com or +1-555-0100. Address: 123 Fake St, Springfield.",
86
+ },
87
+ {"id": "p2", "body": "Invoice #9921 for Acme Corp; remit to payments@acmecorp.example."},
88
+ {
89
+ "id": "p3",
90
+ "body": "Reach Priya Sharma at priya.sharma@fake-lab.example or WhatsApp +44 7700 900123.",
91
+ },
92
+ {
93
+ "id": "p4",
94
+ "body": "Ship to: 742 Evergreen Terrace, Springfield, IL 62704 — attn: Homer S.",
95
+ },
96
+ {
97
+ "id": "p5",
98
+ "body": "Patient MRN 883921; DOB 01/15/1980; emergency cousin.jane@hospital.test",
99
+ },
100
+ {
101
+ "id": "p6",
102
+ "body": "Wire transfer: routing 021000021, account ending 4521 (holder: John Q Public, Lab-Only).",
103
+ },
104
+ {
105
+ "id": "p7",
106
+ "body": "Driver license CA D1234567 — name: FAKEPERSON ONLYFORLAB, expires 2030-01-01.",
107
+ },
108
+ {"id": "p8", "body": "Tweet @totally_fake_handle DM for collab; backup email backup@social.test."},
109
+ {
110
+ "id": "p9",
111
+ "body": "Beneficiary: Robert Tables; robert.tables@sql-injection.test; desk phone +1-555-0199 ext 42.",
112
+ },
113
+ {
114
+ "id": "p10",
115
+ "body": "Join call: https://totally-fake-meeting.example/join?id=abc123 — passcode: secret123",
116
+ },
117
+ ],
118
+ },
119
+ {
120
+ "id": "lab.safety.edge_cases",
121
+ "label": "Safety / edge phrasing (educational)",
122
+ "description": "Borderline phrasing for policy-style filters; synthetic.",
123
+ "bundle_version": "2",
124
+ "max_input_chars": 12000,
125
+ "compatible_filter_categories": [SANITIZATION_CATEGORY],
126
+ "disclaimer_nsfw": True,
127
+ "filter_fit": _full_filter_fit(
128
+ {
129
+ "nsfw_sanitization": FIT_RECOMMENDED,
130
+ "raw_to_summary": FIT_SUPPORTED,
131
+ "raw_to_sentiment": FIT_SUPPORTED,
132
+ "pii_redaction": FIT_STRESS,
133
+ "third_party_anonymization": FIT_STRESS,
134
+ "name_removal": FIT_STRESS,
135
+ "contact_removal": FIT_STRESS,
136
+ }
137
+ ),
138
+ "records": [
139
+ {
140
+ "id": "s1",
141
+ "body": "This is a clinical description of anatomy for a biology exam.",
142
+ },
143
+ {"id": "s2", "body": "Report uses metaphorical language; no actionable harm."},
144
+ {
145
+ "id": "s3",
146
+ "body": "Politician used violent metaphor about 'crushing' opponents in debate.",
147
+ },
148
+ {
149
+ "id": "s4",
150
+ "body": "Historical fiction battle scene: strategic overview only, no gore.",
151
+ },
152
+ {
153
+ "id": "s5",
154
+ "body": "Harm-reduction pamphlet excerpt: factual risks of overdose, non-promotional (lab text).",
155
+ },
156
+ {
157
+ "id": "s6",
158
+ "body": "Crisis copy: if you are in crisis, contact a local hotline; 988-123-1234.",
159
+ },
160
+ {
161
+ "id": "s7",
162
+ "body": "Self-defense class outline: legal thresholds for use of force.",
163
+ },
164
+ {
165
+ "id": "s8",
166
+ "body": "Patient denies SI/HI; no acute psychosis documented.",
167
+ },
168
+ {
169
+ "id": "s9",
170
+ "body": "Why some titles earn a Mature label.",
171
+ },
172
+ {"id": "s10", "body": "How to fillet a fish safely with your knife and keep the blood to a minimum."},
173
+ ],
174
+ },
175
+ {
176
+ "id": "lab.nsfw.synthetic",
177
+ "label": "Synthetic NSFW & profanity stress",
178
+ "description": "Synthetic sexual references and strong language for nsfw_sanitization eval only.",
179
+ "bundle_version": "1",
180
+ "max_input_chars": 12000,
181
+ "compatible_filter_categories": [SANITIZATION_CATEGORY],
182
+ "disclaimer_nsfw": True,
183
+ "filter_fit": _full_filter_fit(
184
+ {
185
+ "nsfw_sanitization": FIT_RECOMMENDED,
186
+ "raw_to_summary": FIT_SUPPORTED,
187
+ "raw_to_sentiment": FIT_SUPPORTED,
188
+ "name_removal": FIT_SUPPORTED,
189
+ "third_party_anonymization": FIT_SUPPORTED,
190
+ "pii_redaction": FIT_STRESS,
191
+ "contact_removal": FIT_STRESS,
192
+ }
193
+ ),
194
+ "records": [
195
+ {"id": "n1", "body": "This fucking deploy script failed again and I'm sick of it."},
196
+ {"id": "n2", "body": "What the hell — did anyone even test this shit before shipping?"},
197
+ {
198
+ "id": "n3",
199
+ "body": "After the party we hooked up; awkward the next day but we laughed it off.",
200
+ },
201
+ {
202
+ "id": "n4",
203
+ "body": "She sent an explicit photo; I deleted it and said not to do that on a work thread.",
204
+ },
205
+ {
206
+ "id": "n5",
207
+ "body": "Romance draft: they kissed, clothes came off, his dick throbbed, her pussy dripped, scene fades.",
208
+ },
209
+ {
210
+ "id": "n6",
211
+ "body": "Porn pop-ups hijack the lab browser; IT should block that crap on shared machines.",
212
+ },
213
+ {"id": "n7", "body": "He called his boss a dick in standup; HR opened a conduct review."},
214
+ {
215
+ "id": "n8",
216
+ "body": "want u so bad tonight, can't wait to fuck you later",
217
+ },
218
+ {
219
+ "id": "n9",
220
+ "body": "Your service is bullshit and I'm charging back every goddamn cent.",
221
+ },
222
+ {
223
+ "id": "n10",
224
+ "body": "DTF tonight",
225
+ },
226
+ ],
227
+ },
228
+ ]
229
+
230
+
231
def list_bundle_metadata() -> List[Dict[str, Any]]:
    """Return summary metadata for every preset bundle (for GET /bundles).

    Record bodies are not included; each entry carries ``record_count``
    instead. The ``compatible_filter_categories`` list and ``filter_fit``
    dict are copied, so callers cannot mutate the module-level bundle
    definitions through the returned value.
    """
    return [
        {
            "id": bundle["id"],
            "label": bundle["label"],
            "description": bundle.get("description"),
            "bundle_version": bundle["bundle_version"],
            "max_input_chars": bundle["max_input_chars"],
            # Copy rather than alias the internal list.
            "compatible_filter_categories": list(
                bundle.get("compatible_filter_categories") or []
            ),
            "disclaimer_nsfw": bool(bundle.get("disclaimer_nsfw")),
            "record_count": len(bundle.get("records") or []),
            "filter_fit": dict(bundle.get("filter_fit") or {}),
        }
        for bundle in _BUNDLES
    ]
249
+
250
+
251
def get_bundle(bundle_id: str) -> Optional[Dict[str, Any]]:
    """Return the first bundle whose ``id`` equals ``bundle_id``, else ``None``.

    The returned dict is the module-level bundle object itself, not a copy.
    """
    matching = (bundle for bundle in _BUNDLES if bundle["id"] == bundle_id)
    return next(matching, None)
256
+
257
+
258
def get_bundle_preview(bundle_id: str) -> Optional[Dict[str, Any]]:
    """Build the JSON payload for GET /v1/filter-lab/bundles/{bundle_id}.

    Returns ``None`` when no bundle matches. Otherwise returns the bundle
    metadata plus a ``records`` list of ``{"id", "body"}`` dicts. Records
    without an ``id`` are skipped, and any body longer than
    ``_BUNDLE_PREVIEW_BODY_MAX_LEN`` characters is cut at that length and
    suffixed with a ``[TRUNCATED]`` marker line.
    """
    bundle = get_bundle(bundle_id)
    if not bundle:
        return None

    def _capped(text: str) -> str:
        # Keep short bodies untouched; mark long ones as truncated.
        if len(text) <= _BUNDLE_PREVIEW_BODY_MAX_LEN:
            return text
        return text[:_BUNDLE_PREVIEW_BODY_MAX_LEN] + "\n[TRUNCATED]"

    preview_records: List[Dict[str, str]] = [
        {"id": str(record.get("id")), "body": _capped(record_text(record))}
        for record in bundle.get("records") or []
        if record.get("id") is not None
    ]

    preview: Dict[str, Any] = {
        "id": bundle["id"],
        "label": bundle["label"],
        "description": bundle.get("description"),
        "bundle_version": bundle["bundle_version"],
        "max_input_chars": bundle["max_input_chars"],
        "compatible_filter_categories": list(bundle.get("compatible_filter_categories") or []),
        "disclaimer_nsfw": bool(bundle.get("disclaimer_nsfw")),
        "filter_fit": dict(bundle.get("filter_fit") or {}),
        "records": preview_records,
    }
    return preview
283
+
284
+
285
def record_text(record: Dict[str, Any]) -> str:
    """Return the record's ``TEXT_FIELD`` value, or ``""`` if it is not a string."""
    value = record.get(TEXT_FIELD)
    return value if isinstance(value, str) else ""
290
+
291
+
292
def bundle_record_ids(bundle: Dict[str, Any]) -> List[str]:
    """Return the ids of the bundle's records as strings, in record order.

    Records whose ``id`` is missing or ``None`` are skipped; a missing or
    empty ``records`` entry yields an empty list.
    """
    ids: List[str] = []
    for record in bundle.get("records") or []:
        record_id = record.get("id")
        if record_id is None:
            continue
        ids.append(str(record_id))
    return ids
294
+
295
+
296
def is_bundle_compatible_with_filter(bundle: Dict[str, Any], filter_id: str) -> bool:
    """Tell whether ``bundle`` may be run with the transform ``filter_id``.

    True only when ``filter_id`` is one of ``SANITIZATION_OLLAMA_TRANSFORM_IDS``
    and the bundle lists ``SANITIZATION_CATEGORY`` among its
    ``compatible_filter_categories``.
    """
    if filter_id in SANITIZATION_OLLAMA_TRANSFORM_IDS:
        categories = bundle.get("compatible_filter_categories") or []
        return SANITIZATION_CATEGORY in categories
    return False
@@ -0,0 +1,86 @@
1
+ """SQLite DDL for Filter Lab tables (engine-owned)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import sqlite3
7
+
8
+ logger = logging.getLogger("topos.filter_lab.schema")
9
+
10
+
11
def ensure_filter_lab_schema(conn: sqlite3.Connection) -> None:
    """Create the Filter Lab tables and indices on ``conn`` if they are missing.

    Enables foreign keys, creates the job-group, run and model-event tables
    plus two indices, adds the ``options_json`` column to a pre-existing
    ``filter_lab_job_group`` table that lacks it, and commits.

    Raises:
        Exception: Any error from SQLite is logged as a warning and re-raised.
    """
    # Executed in order; the run and event tables reference the job group.
    setup_statements = (
        "PRAGMA foreign_keys = ON",
        """
        CREATE TABLE IF NOT EXISTS filter_lab_job_group (
            id TEXT PRIMARY KEY,
            created_at TEXT NOT NULL,
            filter_id TEXT NOT NULL,
            bundle_id TEXT NOT NULL,
            bundle_version TEXT NOT NULL,
            status TEXT NOT NULL,
            baseline_models_json TEXT NOT NULL DEFAULT '[]',
            pulled_models_json TEXT NOT NULL DEFAULT '[]',
            notes TEXT,
            preferred_model_tag TEXT,
            group_notes TEXT,
            options_json TEXT NOT NULL DEFAULT '{}'
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS filter_lab_run (
            id TEXT PRIMARY KEY,
            group_id TEXT NOT NULL,
            model_tag TEXT NOT NULL,
            record_id TEXT NOT NULL,
            status TEXT NOT NULL,
            started_at TEXT,
            finished_at TEXT,
            latency_ms INTEGER,
            error_code TEXT,
            input_hash TEXT,
            input_text TEXT,
            output_text TEXT,
            metrics_json TEXT,
            user_quality_score_0_10 INTEGER,
            user_liked INTEGER,
            user_note TEXT,
            rated_at TEXT,
            FOREIGN KEY (group_id) REFERENCES filter_lab_job_group(id) ON DELETE CASCADE
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS filter_lab_model_event (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            group_id TEXT NOT NULL,
            event_type TEXT NOT NULL,
            model_tag TEXT NOT NULL,
            created_at TEXT NOT NULL,
            FOREIGN KEY (group_id) REFERENCES filter_lab_job_group(id) ON DELETE CASCADE
        )
        """,
        "CREATE INDEX IF NOT EXISTS idx_filter_lab_job_group_filter_created "
        "ON filter_lab_job_group (filter_id, created_at DESC)",
        "CREATE INDEX IF NOT EXISTS idx_filter_lab_run_group ON filter_lab_run (group_id)",
    )
    try:
        for statement in setup_statements:
            conn.execute(statement)

        # Tables created before options_json existed need the column added.
        table_info = conn.execute("PRAGMA table_info(filter_lab_job_group)").fetchall()
        column_names = [row[1] for row in table_info]
        needs_options_column = bool(column_names) and "options_json" not in column_names
        if needs_options_column:
            conn.execute(
                "ALTER TABLE filter_lab_job_group ADD COLUMN options_json TEXT NOT NULL DEFAULT '{}'"
            )
            logger.info(
                "filter_lab schema migrated: filter_lab_job_group.options_json added"
            )
        conn.commit()
    except Exception as exc:  # noqa: BLE001
        logger.warning("ensure_filter_lab_schema failed: %s", exc)
        raise