topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
topos/uma_filters.py ADDED
@@ -0,0 +1,669 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ from datetime import datetime, timedelta, timezone
6
+ from typing import Any, Callable, Dict, List, Optional, Tuple
7
+
8
+ from shared.filtering import FieldTransform, FilterManifest, field_transforms_from_storage, filter_manifest_from_storage
9
+
10
+ logger = logging.getLogger("topos.uma_filters")
11
+
12
+
13
class UMAFilterError(ValueError):
    """Signal that a filter manifest cannot be applied safely.

    Derives from ``ValueError``, so handlers that catch ``ValueError`` also
    catch this error.
    """
15
+
16
+
17
def _parse_iso(s: str) -> Optional[datetime]:
    """Parse an ISO-8601 string into a ``datetime``; return ``None`` on any failure.

    Every ``Z`` in the text is rewritten to ``+00:00`` before the string is
    handed to ``datetime.fromisoformat``. Any exception raised along the way
    (including a non-string argument) results in ``None`` rather than an error.
    """
    try:
        iso_text = s.replace("Z", "+00:00")
        parsed = datetime.fromisoformat(iso_text)
    except Exception:
        return None
    return parsed
22
+
23
+
24
def _normalize_datetime(value: Any) -> Optional[datetime]:
    """Coerce ``value`` into a timezone-aware UTC ``datetime``.

    Accepts ``datetime`` instances and strings (parsed via ``_parse_iso``).
    Naive datetimes are stamped as UTC without shifting the clock time; aware
    datetimes are converted to UTC. ``None``, unparseable strings and every
    other type yield ``None``.
    """
    if isinstance(value, datetime):
        moment = value
    elif isinstance(value, str):
        moment = _parse_iso(value)
        if moment is None:
            return None
    else:
        # None and unsupported types cannot be interpreted as a timestamp.
        return None
    if moment.tzinfo is not None:
        return moment.astimezone(timezone.utc)
    return moment.replace(tzinfo=timezone.utc)
38
+
39
+
40
def _resolve_temporal_datetime(row: Dict[str, Any]) -> Optional[datetime]:
    """Return the first usable timestamp found on ``row``, normalised to UTC.

    Candidate keys are probed in a fixed priority order; the first one whose
    value ``_normalize_datetime`` can interpret wins. Returns ``None`` when no
    candidate yields a datetime.
    """
    candidate_keys = ("event_at", "ts", "timestamp", "visited_at", "start_time", "end_time")
    # The generator is lazy, so probing stops at the first key that normalises.
    resolved = (_normalize_datetime(row.get(key)) for key in candidate_keys)
    return next((moment for moment in resolved if moment is not None), None)
47
+
48
+
49
def _resolve_time_field_for_rows(items: List[Dict[str, Any]]) -> str:
    """Choose which temporal key to use for a batch of rows.

    Keys are tried in priority order; the first key that holds a non-``None``
    value on at least one row is returned. Falls back to ``"event_at"`` when no
    row carries any of the known keys.
    """
    for name in ("event_at", "ts", "timestamp", "visited_at", "start_time", "end_time"):
        if any(row.get(name) is not None for row in items):
            return name
    return "event_at"
56
+
57
+
58
def extract_filter_manifest(filters: Optional[Dict[str, Any]]) -> Optional[FilterManifest]:
    """Pull the ``filter_manifest`` entry out of a filters payload.

    Returns ``None`` when ``filters`` is empty, is not a dict, or has no
    ``filter_manifest`` value; otherwise returns whatever
    ``filter_manifest_from_storage`` builds from the stored value.
    """
    if not filters or not isinstance(filters, dict):
        return None
    stored = filters.get("filter_manifest")
    return None if stored is None else filter_manifest_from_storage(stored)
65
+
66
+
67
def extract_field_transforms(filters: Optional[Dict[str, Any]]) -> Optional[List[FieldTransform]]:
    """Stage 10: read the ``field_transforms`` list from a permission filters payload.

    Returns ``None`` when ``filters`` is empty or not a dict. Otherwise the raw
    ``field_transforms`` value (possibly ``None``) is passed to
    ``field_transforms_from_storage`` and its result is returned.
    """
    usable = bool(filters) and isinstance(filters, dict)
    if not usable:
        return None
    stored = filters.get("field_transforms")
    return field_transforms_from_storage(stored)
72
+
73
+
74
def _params_table_id(item_params: Dict[str, Any]) -> str:
    """Return the filter's ``table_id`` as a stripped string (``""`` when absent or falsy)."""
    raw = item_params.get("table_id")
    if not raw:
        return ""
    return str(raw).strip()
76
+
77
+
78
def _resolve_rolling_window_days(manifest: Optional[FilterManifest], logical_table_id: Optional[str]) -> Optional[int]:
    """Resolve the effective ``rolling_window_days`` value for one logical table.

    An entry whose params name this table overrides the global entry (one with
    no ``table_id``). Entries scoped to other tables and entries whose ``days``
    is not an ``int`` are ignored. When several entries of the same kind exist,
    the last one in manifest order wins. Returns ``None`` when nothing applies.
    """
    if manifest is None:
        return None
    wanted_table = (logical_table_id or "").strip()
    table_days: Optional[int] = None
    default_days: Optional[int] = None
    for entry in manifest.filters:
        if entry.filter_id != "rolling_window_days":
            continue
        candidate = entry.params.get("days")
        if not isinstance(candidate, int):
            continue
        entry_table = _params_table_id(entry.params)
        if not entry_table:
            default_days = candidate
        elif wanted_table and entry_table == wanted_table:
            table_days = candidate
    if wanted_table and table_days is not None:
        return table_days
    return default_days
100
+
101
+
102
def _resolve_row_caps(manifest: Optional[FilterManifest], logical_table_id: Optional[str]) -> List[int]:
    """Collect the ``max_rows`` / ``most_recent_n`` counts that apply to one logical table.

    Counts are clamped to be non-negative. If any entry is scoped to this table,
    only the table-scoped counts are returned; otherwise the global counts
    (entries without ``table_id``) are returned. Entries scoped to other tables
    and entries whose ``count`` is not an ``int`` are ignored.
    """
    if manifest is None:
        return []
    wanted_table = (logical_table_id or "").strip()
    table_caps: List[int] = []
    default_caps: List[int] = []
    for entry in manifest.filters:
        if entry.filter_id not in {"max_rows", "most_recent_n"}:
            continue
        raw_count = entry.params.get("count")
        if not isinstance(raw_count, int):
            continue
        entry_table = _params_table_id(entry.params)
        if not entry_table:
            default_caps.append(max(0, raw_count))
        elif wanted_table and entry_table == wanted_table:
            table_caps.append(max(0, raw_count))
    if table_caps and wanted_table:
        return table_caps
    return default_caps
122
+
123
+
124
def get_limit_cap(requested_limit: int, manifest: Optional[FilterManifest], logical_table_id: Optional[str] = None) -> int:
    """Clamp ``requested_limit`` to the tightest row cap the manifest imposes.

    Returns ``requested_limit`` unchanged when no row cap applies to
    ``logical_table_id``; otherwise returns the smaller of the requested limit
    and the smallest applicable cap.
    """
    applicable = _resolve_row_caps(manifest, logical_table_id)
    if applicable:
        tightest = min(applicable)
        return min(requested_limit, tightest)
    return requested_limit
129
+
130
+
131
def build_sql_constraints(
    manifest: Optional[FilterManifest],
    table_prefix: str,
    logical_table_id: Optional[str] = None,
) -> Tuple[str, List[Any]]:
    """Translate the SQL-expressible manifest filters into a WHERE-clause suffix.

    Handles ``rolling_window_days`` (resolved for ``logical_table_id``),
    ``date_range`` and ``source_filter``; every other filter id is ignored here.

    Args:
        manifest: Parsed filter manifest, or ``None`` for "no constraints".
        table_prefix: Text placed directly before column names (e.g. ``"t."``).
            It is interpolated into the SQL verbatim, so it must come from
            trusted code, never from request data.
        logical_table_id: Table whose scoped rolling window should win over the
            global one.

    Returns:
        ``(sql, params)``. ``sql`` is either ``""`` or a string beginning with
        ``" AND "``; ``params`` holds the positional bind values in order.
    """
    if manifest is None:
        return "", []
    event_at = f"datetime({table_prefix}event_at)"
    conditions: List[str] = []
    params: List[Any] = []
    eff_days = _resolve_rolling_window_days(manifest, logical_table_id)
    if eff_days is not None:
        conditions.append(f"{event_at} >= datetime('now', ?)")
        params.append(f"-{max(0, eff_days)} days")
    for item in manifest.filters:
        if item.filter_id == "date_range":
            for bound_key, operator in (("start", ">="), ("end", "<=")):
                bound = item.params.get(bound_key)
                if not bound:
                    continue
                # Normalise the bound with datetime() too. datetime(col) yields
                # 'YYYY-MM-DD HH:MM:SS', so comparing it against a raw ISO string
                # ('T' separator, 'Z'/offset suffix) is a lexicographic mismatch
                # that misclassifies every row on the boundary day.
                conditions.append(f"{event_at} {operator} datetime(?)")
                params.append(str(bound).strip())
        elif item.filter_id == "source_filter":
            source_ids = item.params.get("source_ids") or []
            if isinstance(source_ids, list) and source_ids:
                placeholders = ",".join("?" for _ in source_ids)
                conditions.append(f"{table_prefix}source_id IN ({placeholders})")
                params.extend(str(sid) for sid in source_ids)
    if not conditions:
        return "", []
    return " AND " + " AND ".join(conditions), params
165
+
166
+
167
def _apply_time_range(
    items: List[Dict[str, Any]],
    field: str,
    start: str,
    end: str,
    inclusive: bool = True,
) -> List[Dict[str, Any]]:
    """Keep only rows whose ``field`` timestamp lies within ``[start, end]``.

    Bounds and row values are normalised to UTC via ``_normalize_datetime``.
    A bound that is empty or cannot be parsed is treated as open-ended, so a
    range with only ``start`` or only ``end`` is still enforced (this matches
    ``build_sql_constraints``, which applies each bound independently). When
    neither bound is usable, ``items`` is returned unchanged.

    Rows whose ``field`` is missing or not a recognisable timestamp are dropped
    whenever at least one bound is in effect.

    Args:
        items: Rows to filter.
        field: Key holding each row's timestamp.
        start: Lower bound as an ISO-8601 string (``""`` for none).
        end: Upper bound as an ISO-8601 string (``""`` for none).
        inclusive: When true the bounds themselves are accepted.
    """
    start_dt = _normalize_datetime(start)
    end_dt = _normalize_datetime(end)
    if start_dt is None and end_dt is None:
        return items

    def _within(moment: datetime) -> bool:
        if start_dt is not None:
            if moment < start_dt or (not inclusive and moment == start_dt):
                return False
        if end_dt is not None:
            if moment > end_dt or (not inclusive and moment == end_dt):
                return False
        return True

    out: List[Dict[str, Any]] = []
    for row in items:
        moment = _normalize_datetime(row.get(field))
        if moment is not None and _within(moment):
            out.append(row)
    return out
193
+
194
+
195
# Enrichment keys the UMA read path attaches after the DB fetch. They are not physical
# table columns, and column allowlists are meant to restrict stored fields, so these are
# kept whenever the engine produced them (names still respect contact_display_names +
# contacts:resolve + sharing_policy upstream).
_UMA_PRESERVE_THROUGH_COLUMN_ALLOWLIST = frozenset(
    {"sender_display_name", "sender_is_owner", "is_from_self"}
)


def _apply_field_include(items: List[Dict[str, Any]], fields: List[str]) -> List[Dict[str, Any]]:
    """Project every row down to the allowlisted ``fields``.

    An empty ``fields`` list means "no allowlist" and returns ``items`` as-is.
    Keys listed in ``_UMA_PRESERVE_THROUGH_COLUMN_ALLOWLIST`` survive the
    projection even when not allowlisted, provided the row holds a value for
    them that is not ``None`` and whose string form is not blank.
    """
    if not fields:
        return items
    keep = set(fields)
    projected: List[Dict[str, Any]] = []
    for row in items:
        kept = {name: value for name, value in row.items() if name in keep}
        for extra in _UMA_PRESERVE_THROUGH_COLUMN_ALLOWLIST:
            value = row.get(extra)
            if value is None or not str(value).strip():
                continue
            # Only fills the key when the allowlist did not already keep it.
            kept.setdefault(extra, value)
        projected.append(kept)
    return projected
216
+
217
+
218
def _apply_field_exclude(items: List[Dict[str, Any]], fields: List[str]) -> List[Dict[str, Any]]:
    """Return copies of the rows with every key named in ``fields`` removed.

    An empty ``fields`` list returns ``items`` itself, untouched.
    """
    if not fields:
        return items
    banned = set(fields)
    stripped: List[Dict[str, Any]] = []
    for row in items:
        stripped.append({name: value for name, value in row.items() if name not in banned})
    return stripped
223
+
224
+
225
def _apply_source(items: List[Dict[str, Any]], source_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """Keep only rows whose ``source_id`` is one of ``source_ids``.

    A missing or empty ``source_ids`` means "no source restriction" and returns
    ``items`` unchanged. Membership is an exact match against the given ids.
    """
    permitted = set(source_ids or ())
    if not permitted:
        return items
    matching: List[Dict[str, Any]] = []
    for row in items:
        if row.get("source_id") in permitted:
            matching.append(row)
    return matching
230
+
231
+
232
def _apply_timestamp_to_date(items: List[Dict[str, Any]], field: str = "event_at") -> List[Dict[str, Any]]:
    """Return row copies whose timestamp is reduced to a UTC calendar date.

    The value under ``field`` is normalised with ``_normalize_datetime`` and
    replaced by its ISO date (``YYYY-MM-DD``). When ``field`` is the default
    ``"event_at"`` and a row has no ``event_at`` value but does have ``ts``,
    ``ts`` is coarsened instead. Values that cannot be interpreted as a
    timestamp are left as they are. Input rows are never mutated.
    """
    coarsened: List[Dict[str, Any]] = []
    for row in items:
        copy = dict(row)
        use_ts_fallback = (
            field == "event_at"
            and copy.get("event_at") is None
            and copy.get("ts") is not None
        )
        target = "ts" if use_ts_fallback else field
        moment = _normalize_datetime(copy.get(target))
        if moment is not None:
            copy[target] = moment.date().isoformat()
        coarsened.append(copy)
    return coarsened
245
+
246
+
247
def _apply_field_transforms(
    items: List[Dict[str, Any]],
    field_transforms: List[FieldTransform],
    table_id: Optional[str] = None,
    diagnostics: Optional[Dict[str, Any]] = None,
    progress_hook: Optional[Callable[[int, int, Optional[str]], None]] = None,
) -> List[Dict[str, Any]]:
    """Apply field-level transforms to a copy of every row.

    Supported transforms are ``timestamp_to_date`` and, when the sanitization
    modules import successfully and report ``enabled``, the Ollama text
    transforms. Anything else is counted as skipped and the value is left as is.

    Args:
        items: Rows to transform; they are copied, never mutated.
        field_transforms: Transforms to run against each row, in order.
        table_id: When given, transforms scoped to a different table are skipped.
        diagnostics: Optional dict that receives ``applied_count``,
            ``skipped_count`` and per-reason ``skip_reasons`` tallies.
        progress_hook: Optional ``(done, total, label)`` callback invoked once per
            (row, transform) unit. It is best-effort: any error it raises is
            swallowed so progress reporting can never abort the transform pass.

    Returns:
        ``items`` itself when there is nothing to do, otherwise a new list of
        transformed row copies.
    """
    if not field_transforms or not items:
        return items
    ollama_ids: Tuple[str, ...] = ()
    apply_ollama: Optional[Callable[..., str]] = None
    try:
        from topos.config.sanitization_ollama import resolve_sanitization_ollama_effective
        from topos.config.settings import settings as _settings
        from topos.core.state import get_db_connection
        from topos.sanitization.ollama_transforms import (
            OLLAMA_TRANSFORM_IDS as _ollama_ids,
            apply_text_transform_with_ollama as _apply_ollama,
        )

        ollama_ids = _ollama_ids
        ollama_effective = resolve_sanitization_ollama_effective(_settings, get_db_connection())
        if ollama_effective.enabled:
            apply_ollama = lambda text, tid, p: _apply_ollama(text, tid, p, effective=ollama_effective)
    except ImportError:
        # Sanitization modules are unavailable: Ollama transforms will be
        # reported as "transform_unavailable" below.
        pass

    stats = diagnostics if isinstance(diagnostics, dict) else None
    if stats is not None:
        stats.setdefault("applied_count", 0)
        stats.setdefault("skipped_count", 0)
        stats.setdefault("skip_reasons", {})

    def _skip(reason: str) -> None:
        if stats is None:
            return
        stats["skipped_count"] += 1
        reasons = stats["skip_reasons"]
        reasons[reason] = int(reasons.get(reason) or 0) + 1

    def _applied() -> None:
        if stats is None:
            return
        stats["applied_count"] += 1

    def _notify(done: int, total: int, label: str) -> None:
        # Report progress; retry without the label on TypeError, never raise.
        if progress_hook is None:
            return
        try:
            progress_hook(done, total, label)
        except TypeError:
            try:
                progress_hook(done, total, None)  # type: ignore[misc]
            except Exception:
                # The retry must be guarded as well, otherwise a failing hook
                # escapes from inside the except clause and aborts the batch.
                pass
        except Exception:
            pass

    def _transform(updated: Dict[str, Any], ft: Any) -> None:
        # Apply one transform to one row copy in place and tally the outcome.
        if table_id is not None and ft.table_id and ft.table_id != table_id:
            _skip("table_mismatch")
            return
        if ft.field not in updated:
            _skip("field_missing")
            return
        if ft.transform_id == "timestamp_to_date":
            dt = _normalize_datetime(updated.get(ft.field))
            if dt is not None:
                updated[ft.field] = dt.date().isoformat()
                _applied()
            else:
                _skip("value_not_datetime")
        elif apply_ollama is not None and ft.transform_id in ollama_ids:
            val = updated.get(ft.field)
            if isinstance(val, str) and val.strip():
                try:
                    updated[ft.field] = apply_ollama(val, ft.transform_id, dict(ft.params or {}))
                    _applied()
                except Exception as exc:  # noqa: BLE001
                    # NOTE(review): on failure the original value stays in the
                    # row — confirm this is acceptable for sanitization transforms.
                    logger.warning(
                        "Ollama field transform %s on field %r failed: %s",
                        ft.transform_id,
                        ft.field,
                        exc,
                    )
                    _skip("handler_error")
            else:
                _skip("empty_value")
        else:
            _skip("transform_unavailable")

    out: List[Dict[str, Any]] = []
    total_units = max(1, len(items) * max(1, len(field_transforms)))
    unit_idx = 0
    for row in items:
        updated = dict(row)
        for ft in field_transforms:
            unit_idx += 1
            _transform(updated, ft)
            _notify(unit_idx, total_units, f"{ft.transform_id}({ft.field})")
        out.append(updated)
    return out
357
+
358
+
359
def apply_filter_manifest(
    items: List[Dict[str, Any]],
    manifest: Optional[FilterManifest],
    field_transforms: Optional[List[FieldTransform]] = None,
    table_id: Optional[str] = None,
    diagnostics: Optional[Dict[str, Any]] = None,
    progress_hook: Optional[Callable[[int, int, Optional[str]], None]] = None,
) -> List[Dict[str, Any]]:
    """Apply a filter manifest, then field transforms, to in-memory rows.

    Order of operations: the rolling window resolved for ``table_id``, then
    each manifest filter in manifest order, then the tightest row cap, and
    finally ``field_transforms`` (which run even when ``manifest`` is ``None``).
    Contact-related filters, row caps and the rolling window are skipped inside
    the per-filter loop because they are handled outside it.

    Returns ``items`` itself when it is empty; otherwise a new list.

    Raises:
        UMAFilterError: a filter has malformed params or an unsupported id.
    """
    if not items:
        return items
    rows = list(items)
    window_days = _resolve_rolling_window_days(manifest, table_id)
    caps = _resolve_row_caps(manifest, table_id)
    row_cap = min(caps) if caps else None

    if manifest is not None:
        if window_days is not None:
            cutoff = datetime.now(timezone.utc) - timedelta(days=max(0, window_days))
            recent: List[Dict[str, Any]] = []
            for row in rows:
                stamp = _resolve_temporal_datetime(row)
                if stamp is not None and stamp >= cutoff:
                    recent.append(row)
            rows = recent

        for entry in manifest.filters:
            kind = entry.filter_id
            if kind == "rolling_window_days":
                # Already applied above with table scoping resolved.
                continue
            if kind == "date_range":
                rows = _apply_time_range(
                    rows,
                    field=_resolve_time_field_for_rows(rows),
                    start=str(entry.params.get("start") or ""),
                    end=str(entry.params.get("end") or ""),
                    inclusive=True,
                )
                continue
            if kind in {"max_rows", "most_recent_n"}:
                # Enforced once, after the loop, via row_cap.
                continue
            if kind == "source_filter":
                wanted_sources = entry.params.get("source_ids")
                if not isinstance(wanted_sources, list):
                    raise UMAFilterError("source_filter requires source_ids list")
                rows = _apply_source(rows, [str(sid) for sid in wanted_sources])
            elif kind == "column_allowlist":
                columns = entry.params.get("fields")
                if not isinstance(columns, list):
                    raise UMAFilterError("column_allowlist requires fields list")
                rows = _apply_field_include(rows, [str(column) for column in columns])
            elif kind == "column_blocklist":
                columns = entry.params.get("fields")
                if not isinstance(columns, list):
                    raise UMAFilterError("column_blocklist requires fields list")
                rows = _apply_field_exclude(rows, [str(column) for column in columns])
            elif kind == "timestamp_to_date":
                rows = _apply_timestamp_to_date(rows)
            elif kind in ("contact_display_names", "message_contact_participation"):
                # Stage 11: applied in uma_contact_enrichment before/after this pass when dataset_id is known.
                continue
            elif kind == "event_contact_participation":
                continue
            else:
                raise UMAFilterError(f"Unsupported manifest filter for this endpoint: {entry.filter_id}")

        if row_cap is not None:
            rows = rows[: max(0, row_cap)]

    if field_transforms:
        rows = _apply_field_transforms(
            rows,
            field_transforms,
            table_id=table_id,
            diagnostics=diagnostics,
            progress_hook=progress_hook,
        )
    return rows
429
+
430
+
431
async def _apply_field_transforms_async(
    items: List[Dict[str, Any]],
    field_transforms: List[FieldTransform],
    table_id: Optional[str] = None,
    diagnostics: Optional[Dict[str, Any]] = None,
    progress_hook: Optional[Callable[[int, int, Optional[str]], None]] = None,
) -> List[Dict[str, Any]]:
    """Async variant that keeps the event loop responsive during LLM transform calls.

    Behaves like the synchronous field-transform pass, except that each Ollama
    transform call runs through ``asyncio.to_thread`` and control is yielded to
    the event loop after every row.

    Args:
        items: Rows to transform; they are copied, never mutated.
        field_transforms: Transforms to run against each row, in order.
        table_id: When given, transforms scoped to a different table are skipped.
        diagnostics: Optional dict that receives ``applied_count``,
            ``skipped_count`` and per-reason ``skip_reasons`` tallies.
        progress_hook: Optional synchronous ``(done, total, label)`` callback
            invoked once per (row, transform) unit. It is best-effort: any error
            it raises is swallowed.

    Returns:
        ``items`` itself when there is nothing to do, otherwise a new list of
        transformed row copies.
    """
    if not field_transforms or not items:
        return items
    ollama_ids: Tuple[str, ...] = ()
    apply_ollama_sync: Optional[Callable[..., str]] = None
    try:
        from topos.config.sanitization_ollama import resolve_sanitization_ollama_effective
        from topos.config.settings import settings as _settings
        from topos.core.state import get_db_connection
        from topos.sanitization.ollama_transforms import (
            OLLAMA_TRANSFORM_IDS as _ollama_ids,
            apply_text_transform_with_ollama as _apply_ollama,
        )

        ollama_ids = _ollama_ids
        ollama_effective = resolve_sanitization_ollama_effective(_settings, get_db_connection())
        if ollama_effective.enabled:
            apply_ollama_sync = lambda text, tid, p: _apply_ollama(text, tid, p, effective=ollama_effective)
    except ImportError:
        # Sanitization modules are unavailable: Ollama transforms will be
        # reported as "transform_unavailable" below.
        pass

    stats = diagnostics if isinstance(diagnostics, dict) else None
    if stats is not None:
        stats.setdefault("applied_count", 0)
        stats.setdefault("skipped_count", 0)
        stats.setdefault("skip_reasons", {})

    def _skip(reason: str) -> None:
        if stats is None:
            return
        stats["skipped_count"] += 1
        reasons = stats["skip_reasons"]
        reasons[reason] = int(reasons.get(reason) or 0) + 1

    def _applied() -> None:
        if stats is None:
            return
        stats["applied_count"] += 1

    def _notify(done: int, total: int, label: str) -> None:
        # Report progress; retry without the label on TypeError, never raise.
        if progress_hook is None:
            return
        try:
            progress_hook(done, total, label)
        except TypeError:
            try:
                progress_hook(done, total, None)  # type: ignore[misc]
            except Exception:
                # The retry must be guarded as well, otherwise a failing hook
                # escapes from inside the except clause and aborts the batch.
                pass
        except Exception:
            pass

    async def _transform(updated: Dict[str, Any], ft: Any) -> None:
        # Apply one transform to one row copy in place and tally the outcome.
        if table_id is not None and ft.table_id and ft.table_id != table_id:
            _skip("table_mismatch")
            return
        if ft.field not in updated:
            _skip("field_missing")
            return
        if ft.transform_id == "timestamp_to_date":
            dt = _normalize_datetime(updated.get(ft.field))
            if dt is not None:
                updated[ft.field] = dt.date().isoformat()
                _applied()
            else:
                _skip("value_not_datetime")
        elif apply_ollama_sync is not None and ft.transform_id in ollama_ids:
            val = updated.get(ft.field)
            if isinstance(val, str) and val.strip():
                try:
                    # Run the blocking transform call off the event loop thread.
                    updated[ft.field] = await asyncio.to_thread(
                        apply_ollama_sync,
                        val,
                        ft.transform_id,
                        dict(ft.params or {}),
                    )
                    _applied()
                except Exception as exc:  # noqa: BLE001
                    # NOTE(review): on failure the original value stays in the
                    # row — confirm this is acceptable for sanitization transforms.
                    logger.warning(
                        "Ollama field transform %s on field %r failed: %s",
                        ft.transform_id,
                        ft.field,
                        exc,
                    )
                    _skip("handler_error")
            else:
                _skip("empty_value")
        else:
            _skip("transform_unavailable")

    out: List[Dict[str, Any]] = []
    total_units = max(1, len(items) * max(1, len(field_transforms)))
    unit_idx = 0
    for row in items:
        updated = dict(row)
        for ft in field_transforms:
            unit_idx += 1
            await _transform(updated, ft)
            _notify(unit_idx, total_units, f"{ft.transform_id}({ft.field})")
        out.append(updated)
        # Cooperative yield so websocket ping/pong can proceed while long transform batches run.
        await asyncio.sleep(0)
    return out
549
+
550
+
551
async def apply_filter_manifest_async(
    items: List[Dict[str, Any]],
    manifest: Optional[FilterManifest],
    field_transforms: Optional[List[FieldTransform]] = None,
    table_id: Optional[str] = None,
    diagnostics: Optional[Dict[str, Any]] = None,
    progress_hook: Optional[Callable[[int, int, Optional[str]], None]] = None,
) -> List[Dict[str, Any]]:
    """Apply a filter manifest, then field transforms, without blocking the loop.

    Async counterpart of ``apply_filter_manifest``. Processing order:

    1. Rolling window: when a window length is resolved, keep only rows whose
       resolved timestamp exists and is not older than ``now(UTC) - days``.
    2. Each manifest filter, in declaration order.
    3. Row cap: truncate to the smallest resolved cap, if any.
    4. Field transforms, awaited via ``_apply_field_transforms_async``.

    Args:
        items: Rows to filter. Returned as-is (same object) when empty.
        manifest: Filter manifest; ``None`` skips steps 1-3.
        field_transforms: Optional per-field transforms applied last.
        table_id: Forwarded to the window/cap resolvers and the transform step.
        diagnostics: Forwarded to the transform step.
        progress_hook: Forwarded to the transform step.

    Returns:
        The filtered (and possibly transformed) rows.

    Raises:
        UMAFilterError: If ``source_filter`` / ``column_allowlist`` /
            ``column_blocklist`` params are not lists, or a filter id is not
            supported here.
    """
    if not items:
        return items

    rows = list(items)
    window_days = _resolve_rolling_window_days(manifest, table_id)
    row_caps = _resolve_row_caps(manifest, table_id)
    row_cap = min(row_caps) if row_caps else None

    if manifest is not None:
        if window_days is not None:
            cutoff = datetime.now(timezone.utc) - timedelta(days=max(0, window_days))
            recent = []
            for row in rows:
                stamp = _resolve_temporal_datetime(row)
                if stamp is not None and stamp >= cutoff:
                    recent.append(row)
            rows = recent

        # Filter ids that are accepted but do no per-filter work in this loop:
        # the window and row caps are resolved up front via the helpers above,
        # and the contact/participation ids are simply skipped here.
        skipped_ids = (
            "rolling_window_days",
            "max_rows",
            "most_recent_n",
            "contact_display_names",
            "message_contact_participation",
            "event_contact_participation",
        )

        for entry in manifest.filters:
            filter_id = entry.filter_id
            if filter_id in skipped_ids:
                continue

            if filter_id == "date_range":
                params = entry.params
                rows = _apply_time_range(
                    rows,
                    field=_resolve_time_field_for_rows(rows),
                    start=str(params.get("start") or ""),
                    end=str(params.get("end") or ""),
                    inclusive=True,
                )
            elif filter_id == "source_filter":
                wanted_sources = entry.params.get("source_ids")
                if not isinstance(wanted_sources, list):
                    raise UMAFilterError("source_filter requires source_ids list")
                rows = _apply_source(rows, [str(sid) for sid in wanted_sources])
            elif filter_id == "column_allowlist":
                allowed = entry.params.get("fields")
                if not isinstance(allowed, list):
                    raise UMAFilterError("column_allowlist requires fields list")
                rows = _apply_field_include(rows, [str(name) for name in allowed])
            elif filter_id == "column_blocklist":
                blocked = entry.params.get("fields")
                if not isinstance(blocked, list):
                    raise UMAFilterError("column_blocklist requires fields list")
                rows = _apply_field_exclude(rows, [str(name) for name in blocked])
            elif filter_id == "timestamp_to_date":
                rows = _apply_timestamp_to_date(rows)
            else:
                raise UMAFilterError(
                    f"Unsupported manifest filter for this endpoint: {entry.filter_id}"
                )

        if row_cap is not None:
            # A negative cap is clamped to zero rows rather than slicing from the end.
            rows = rows[: max(0, row_cap)]

    if not field_transforms:
        return rows

    return await _apply_field_transforms_async(
        rows,
        field_transforms,
        table_id=table_id,
        diagnostics=diagnostics,
        progress_hook=progress_hook,
    )
621
+
622
+
623
def _apply_single_filter(items: List[Dict[str, Any]], filter_def: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Apply one legacy-style filter definition, selected by its ``type`` key.

    Supported types are ``time_range``, ``field_include``, ``field_exclude``
    and ``source``. Any other (or missing) type leaves ``items`` untouched.

    Args:
        items: Rows to filter.
        filter_def: Mapping holding ``type`` plus the type-specific options.

    Returns:
        The result of the matching helper, or ``items`` itself when the type
        is not recognised.
    """
    kind = filter_def.get("type")

    if kind == "time_range":
        # A missing or empty field name falls back to "event_at".
        time_field = filter_def.get("field") or "event_at"
        range_start = filter_def.get("start", "")
        range_end = filter_def.get("end", "")
        is_inclusive = filter_def.get("inclusive", True)
        return _apply_time_range(
            items,
            field=time_field,
            start=range_start,
            end=range_end,
            inclusive=is_inclusive,
        )

    if kind == "field_include":
        return _apply_field_include(items, filter_def.get("fields") or [])

    if kind == "field_exclude":
        return _apply_field_exclude(items, filter_def.get("fields") or [])

    if kind == "source":
        wanted = filter_def.get("source_ids")
        single = filter_def.get("source_id")
        # A lone ``source_id`` is only used when no ``source_ids`` key is given.
        if wanted is None and single is not None:
            wanted = [single]
        return _apply_source(items, wanted)

    return items
644
+
645
+
646
def apply_filters(
    items: List[Dict[str, Any]],
    filters: Optional[Dict[str, Any]],
    *,
    max_depth: int = 10,
) -> List[Dict[str, Any]]:
    """Filter ``items`` according to ``filters``; best-effort on unexpected errors.

    When ``filters`` yields a manifest and/or field transforms, the work is
    delegated to ``apply_filter_manifest``. Otherwise, a mapping carrying a
    ``type`` key is treated as a single filter definition.

    Args:
        items: Rows to filter.
        filters: Filter specification; falsy or non-dict values are ignored.
        max_depth: Accepted but not used by this implementation.

    Returns:
        The filtered rows, or ``items`` itself when there is nothing to apply.
        NOTE: any exception other than ``UMAFilterError`` is logged as a
        warning and the *unfiltered* ``items`` are returned.

    Raises:
        UMAFilterError: Propagated unchanged from the filter helpers.
    """
    del max_depth  # unused here

    nothing_to_do = not items or not filters or not isinstance(filters, dict)
    if nothing_to_do:
        return items

    try:
        manifest = extract_filter_manifest(filters)
        transforms = extract_field_transforms(filters)
        if manifest is not None or transforms:
            return apply_filter_manifest(list(items), manifest, field_transforms=transforms)
        if "type" in filters:
            return _apply_single_filter(list(items), filters)
        return items
    except UMAFilterError:
        raise
    except Exception as exc:  # noqa: BLE001
        logger.warning("UMA filter application failed: %s", exc)
        return items
@@ -0,0 +1,24 @@
1
+ """Parse UMA ``resource_id`` strings on the engine (no control_plane dependency)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+
8
def parse_dataset_id_from_uma_dataset_resource_id(resource_id: Optional[str]) -> Optional[str]:
    """
    Extract the dataset key from ``dataset:{owner}:{dataset_segments...}:{device_id}``.

    The dataset key is everything between the owner segment and the final
    segment (device_id); it may itself contain colons (e.g. ``{uuid}:default``).
    The ``dataset`` prefix is matched case-insensitively. This must match
    ``control_plane.uma.models.resource_id_parse``.

    Returns ``None`` for empty input, fewer than four colon-separated segments,
    a different prefix, or a blank dataset key.
    """
    text = (resource_id or "").strip()

    # Peel off "<scheme>:<owner>:" from the left; the remainder still holds
    # the dataset key followed by ":<device_id>".
    leading = text.split(":", 2)
    if len(leading) < 3:
        return None
    scheme, _owner, remainder = leading

    # The last colon separates the dataset key from the device id; without it
    # there are fewer than four segments overall.
    dataset_key, colon, _device_id = remainder.rpartition(":")
    if not colon:
        return None
    if scheme.lower() != "dataset":
        return None

    return dataset_key.strip() or None