topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
shared/filtering.py ADDED
@@ -0,0 +1,640 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+ from enum import Enum
6
+ from typing import Any, Dict, Iterable, List, Literal, Optional
7
+
8
+ from pydantic import BaseModel, Field, field_validator, model_validator
9
+
10
+
11
class FilterCategory(str, Enum):
    """Family a filter belongs to.

    Members are ``str`` subclasses, so each one compares equal to its
    lowercase value string.
    """

    RETRIEVAL = "retrieval"
    AGGREGATION = "aggregation"
    FIELD_LEVEL = "field_level"
    SANITIZATION = "sanitization"
    INFERABILITY_REDUCTION = "inferability_reduction"
17
+
18
+
19
class FilterRuntimeStatus(str, Enum):
    """Runtime availability of a catalog filter.

    ``validate_filter_params`` rejects ``FUTURE_ONLY`` filters unless the
    caller passes ``allow_future=True``.
    """

    SUPPORTED_NOW = "supported_now"
    STAGE_8_TARGET = "stage_8_target"
    FUTURE_ONLY = "future_only"
23
+
24
+
25
# Coarse compute/latency bucket used to group filters in the UI. The tier is
# hand-assigned per filter; it is not a measured SLA.
FilterComputeTier = Literal["low", "medium", "high"]
VALID_FILTER_COMPUTE_TIERS = frozenset(("low", "medium", "high"))
28
+
29
+
30
@dataclass(frozen=True)
class FilterParamSpec:
    """Immutable description of one parameter a filter accepts."""

    # Key expected in the filter's ``params`` mapping.
    name: str
    # Type tag checked by ``_validate_param``: "bool", "int", "str",
    # "iso_datetime", "enum" or "list".
    value_type: str
    # When True, ``validate_filter_params`` fails if the key is absent.
    required: bool = False
    # Inclusive lower bound; only consulted for "int" parameters.
    min_value: Optional[float] = None
    # Permitted values; only consulted for "enum" parameters.
    allowed_values: Optional[List[str]] = None
    # Element type for "list" parameters; only "str" triggers item checks.
    item_type: Optional[str] = None
38
+
39
+
40
@dataclass(frozen=True)
class FilterDefinition:
    """Immutable catalog record describing a single filter."""

    # Unique identifier; also the key under which the entry is registered.
    filter_id: str
    # Definition version number.
    version: int
    # Family the filter belongs to.
    category: FilterCategory
    # Human-readable label and explanatory text.
    display_name: str
    description: str
    # Accepted parameters; an empty list means the filter takes none.
    parameter_schema: List[FilterParamSpec] = field(default_factory=list)
    # NOTE(review): semantics of these rules are not visible here — confirm.
    compatibility_rules: Dict[str, Any] = field(default_factory=dict)
    # Free-form presentation hints (e.g. "group", "compute_tier").
    ui_metadata: Dict[str, Any] = field(default_factory=dict)
    # NOTE(review): presumably a default grant flag for third parties — confirm.
    third_party_default_allowed: bool = True
    # Availability of the filter at runtime.
    runtime_status: FilterRuntimeStatus = FilterRuntimeStatus.FUTURE_ONLY
    # Dotted identifier of the implementing handler, when one is assigned.
    handler_id: Optional[str] = None
53
+
54
+
55
def _catalog_entry(
    filter_id: str,
    category: FilterCategory,
    description: str,
    *,
    parameter_schema: Optional[List[FilterParamSpec]] = None,
    runtime_status: FilterRuntimeStatus = FilterRuntimeStatus.FUTURE_ONLY,
    handler_id: Optional[str] = None,
    ui_group: str = "Advanced",
    compute_tier: FilterComputeTier = "low",
) -> FilterDefinition:
    """Build a version-1 ``FilterDefinition`` with the catalog's shared defaults.

    The display name is derived from ``filter_id`` (underscores become
    spaces, then title-cased). ``ui_group`` and ``compute_tier`` are stored
    in ``ui_metadata``. Compatibility rules start empty and third-party
    access defaults to allowed.
    """
    human_label = filter_id.replace("_", " ").title()
    specs = parameter_schema or []
    presentation = {"group": ui_group, "compute_tier": compute_tier}
    return FilterDefinition(
        filter_id=filter_id,
        version=1,
        category=category,
        display_name=human_label,
        description=description,
        parameter_schema=specs,
        compatibility_rules={},
        ui_metadata=presentation,
        third_party_default_allowed=True,
        runtime_status=runtime_status,
        handler_id=handler_id,
    )
79
+
80
+
81
# Registry of every known filter, keyed by filter_id. Each key repeats the
# filter_id passed to _catalog_entry. Entries that omit compute_tier get
# _catalog_entry's default of "low".
FILTER_CATALOG: Dict[str, FilterDefinition] = {
    # --- Retrieval filters ---
    "rolling_window_days": _catalog_entry(
        "rolling_window_days",
        FilterCategory.RETRIEVAL,
        "Restrict records to the last N days.",
        parameter_schema=[
            FilterParamSpec("days", "int", required=True, min_value=0),
            FilterParamSpec("table_id", "str", required=False),
        ],
        runtime_status=FilterRuntimeStatus.SUPPORTED_NOW,
        handler_id="retrieval.rolling_window_days",
        ui_group="Time",
    ),
    "date_range": _catalog_entry(
        "date_range",
        FilterCategory.RETRIEVAL,
        "Restrict records to an explicit start/end range.",
        parameter_schema=[
            FilterParamSpec("start", "iso_datetime", required=True),
            FilterParamSpec("end", "iso_datetime", required=True),
        ],
        runtime_status=FilterRuntimeStatus.SUPPORTED_NOW,
        handler_id="retrieval.date_range",
        ui_group="Time",
    ),
    "max_rows": _catalog_entry(
        "max_rows",
        FilterCategory.RETRIEVAL,
        "Cap the number of returned rows.",
        parameter_schema=[
            FilterParamSpec("count", "int", required=True, min_value=0),
            FilterParamSpec("table_id", "str", required=False),
        ],
        runtime_status=FilterRuntimeStatus.SUPPORTED_NOW,
        handler_id="retrieval.max_rows",
        ui_group="Detail",
    ),
    "most_recent_n": _catalog_entry(
        "most_recent_n",
        FilterCategory.RETRIEVAL,
        "Return only the N most recent records.",
        parameter_schema=[
            FilterParamSpec("count", "int", required=True, min_value=0),
            FilterParamSpec("table_id", "str", required=False),
        ],
        runtime_status=FilterRuntimeStatus.SUPPORTED_NOW,
        handler_id="retrieval.most_recent_n",
        ui_group="Detail",
    ),
    "source_filter": _catalog_entry(
        "source_filter",
        FilterCategory.RETRIEVAL,
        "Limit results to a set of source_ids.",
        parameter_schema=[FilterParamSpec("source_ids", "list", required=True, item_type="str")],
        runtime_status=FilterRuntimeStatus.SUPPORTED_NOW,
        handler_id="retrieval.source_filter",
        ui_group="Sources",
    ),
    # --- Field-level filters ---
    "column_allowlist": _catalog_entry(
        "column_allowlist",
        FilterCategory.FIELD_LEVEL,
        "Only include the specified fields.",
        parameter_schema=[FilterParamSpec("fields", "list", required=True, item_type="str")],
        runtime_status=FilterRuntimeStatus.SUPPORTED_NOW,
        handler_id="field_level.column_allowlist",
        ui_group="Detail",
    ),
    "column_blocklist": _catalog_entry(
        "column_blocklist",
        FilterCategory.FIELD_LEVEL,
        "Exclude the specified fields.",
        parameter_schema=[FilterParamSpec("fields", "list", required=True, item_type="str")],
        runtime_status=FilterRuntimeStatus.SUPPORTED_NOW,
        handler_id="field_level.column_blocklist",
        ui_group="Detail",
    ),
    # --- Aggregation filters (no parameters) ---
    "daily_rollup": _catalog_entry(
        "daily_rollup",
        FilterCategory.AGGREGATION,
        "Aggregate records by day.",
        runtime_status=FilterRuntimeStatus.STAGE_8_TARGET,
        handler_id="aggregation.daily_rollup",
        ui_group="Summaries",
    ),
    "weekly_rollup": _catalog_entry(
        "weekly_rollup",
        FilterCategory.AGGREGATION,
        "Aggregate records by week.",
        runtime_status=FilterRuntimeStatus.STAGE_8_TARGET,
        handler_id="aggregation.weekly_rollup",
        ui_group="Summaries",
    ),
    "count_only": _catalog_entry(
        "count_only",
        FilterCategory.AGGREGATION,
        "Return only record counts.",
        runtime_status=FilterRuntimeStatus.STAGE_8_TARGET,
        handler_id="aggregation.count_only",
        ui_group="Summaries",
    ),
    # --- Sanitization filters ---
    "timestamp_to_date": _catalog_entry(
        "timestamp_to_date",
        FilterCategory.SANITIZATION,
        "Reduce timestamps to date precision.",
        runtime_status=FilterRuntimeStatus.SUPPORTED_NOW,
        handler_id="sanitization.timestamp_to_date",
        ui_group="Redaction",
        compute_tier="low",
    ),
    "raw_to_summary": _catalog_entry(
        "raw_to_summary",
        FilterCategory.SANITIZATION,
        "Transform raw text into a summary.",
        parameter_schema=[
            FilterParamSpec("style", "str", required=False),
            FilterParamSpec("max_length", "int", required=False, min_value=1),
        ],
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="sanitization.raw_to_summary",
        ui_group="Redaction",
        compute_tier="high",
    ),
    "raw_to_sentiment": _catalog_entry(
        "raw_to_sentiment",
        FilterCategory.SANITIZATION,
        "Transform raw text into sentiment output.",
        parameter_schema=[
            FilterParamSpec("scale", "str", required=False),
            FilterParamSpec("labels", "list", required=False, item_type="str"),
        ],
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="sanitization.raw_to_sentiment",
        ui_group="Redaction",
        compute_tier="medium",
    ),
    "third_party_anonymization": _catalog_entry(
        "third_party_anonymization",
        FilterCategory.SANITIZATION,
        "Redact third-party identities from content.",
        parameter_schema=[FilterParamSpec("mode", "str", required=False)],
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="sanitization.third_party_anonymization",
        ui_group="Redaction",
        compute_tier="medium",
    ),
    "name_removal": _catalog_entry(
        "name_removal",
        FilterCategory.SANITIZATION,
        "Remove names from content.",
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="sanitization.name_removal",
        ui_group="Redaction",
        compute_tier="medium",
    ),
    "contact_removal": _catalog_entry(
        "contact_removal",
        FilterCategory.SANITIZATION,
        "Remove contact details from content.",
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="sanitization.contact_removal",
        ui_group="Redaction",
        compute_tier="medium",
    ),
    "pii_redaction": _catalog_entry(
        "pii_redaction",
        FilterCategory.SANITIZATION,
        "Redact PII (names, contact details, etc.) from content using NER and replacement.",
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="sanitization.pii_redaction",
        ui_group="Redaction",
        compute_tier="medium",
    ),
    "nsfw_sanitization": _catalog_entry(
        "nsfw_sanitization",
        FilterCategory.SANITIZATION,
        "Mask or redact NSFW content in text.",
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="sanitization.nsfw_sanitization",
        ui_group="Redaction",
        compute_tier="high",
    ),
    # --- Inferability-reduction filters (all FUTURE_ONLY) ---
    "coords_to_city": _catalog_entry(
        "coords_to_city",
        FilterCategory.INFERABILITY_REDUCTION,
        "Reduce coordinate precision to city-level.",
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="inferability.coords_to_city",
        ui_group="Redaction",
        compute_tier="medium",
    ),
    "amount_to_range": _catalog_entry(
        "amount_to_range",
        FilterCategory.INFERABILITY_REDUCTION,
        "Reduce numeric amounts to bands.",
        parameter_schema=[FilterParamSpec("bands", "list", required=False, item_type="str")],
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="inferability.amount_to_range",
        ui_group="Redaction",
        compute_tier="low",
    ),
    "behavior_anonymization": _catalog_entry(
        "behavior_anonymization",
        FilterCategory.INFERABILITY_REDUCTION,
        "Break direct identity-behavior linkage.",
        parameter_schema=[FilterParamSpec("mode", "str", required=False)],
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="inferability.behavior_anonymization",
        ui_group="Redaction",
        compute_tier="medium",
    ),
    "precision_level": _catalog_entry(
        "precision_level",
        FilterCategory.INFERABILITY_REDUCTION,
        "Coarsen exposed precision level.",
        parameter_schema=[FilterParamSpec("level", "enum", required=True, allowed_values=["exact", "city", "region", "band"])],
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="inferability.precision_level",
        ui_group="Redaction",
        compute_tier="high",
    ),
    # --- Stage 11: contact resolution & participation (see sprints_roles_scopes_stage_11) ---
    "contact_display_names": _catalog_entry(
        "contact_display_names",
        FilterCategory.FIELD_LEVEL,
        "Resolve sender display names from the contact book when contacts:resolve is granted.",
        parameter_schema=[FilterParamSpec("enabled", "bool", required=True)],
        runtime_status=FilterRuntimeStatus.SUPPORTED_NOW,
        handler_id="field_level.contact_display_names",
        ui_group="Contacts",
    ),
    "message_contact_participation": _catalog_entry(
        "message_contact_participation",
        FilterCategory.RETRIEVAL,
        "Include or exclude messages by resolved contact_id for the sender.",
        parameter_schema=[
            FilterParamSpec("mode", "enum", required=True, allowed_values=["all", "allowlist", "blocklist"]),
            FilterParamSpec("contact_ids", "list", required=True, item_type="str"),
            FilterParamSpec("match", "enum", required=True, allowed_values=["sender_only", "thread_participants"]),
        ],
        runtime_status=FilterRuntimeStatus.SUPPORTED_NOW,
        handler_id="retrieval.message_contact_participation",
        ui_group="Contacts",
    ),
    "event_contact_participation": _catalog_entry(
        "event_contact_participation",
        FilterCategory.RETRIEVAL,
        "Phase 2: include or exclude events by linked contacts (organizer/attendee).",
        parameter_schema=[
            FilterParamSpec("mode", "enum", required=True, allowed_values=["all", "allowlist", "blocklist"]),
            FilterParamSpec("contact_ids", "list", required=True, item_type="str"),
            FilterParamSpec("match", "enum", required=True, allowed_values=["organizer", "attendee", "any_linked"]),
        ],
        runtime_status=FilterRuntimeStatus.FUTURE_ONLY,
        handler_id="retrieval.event_contact_participation",
        ui_group="Contacts",
    ),
}
338
+
339
+
340
def list_filter_definitions() -> List[FilterDefinition]:
    """Return a new list of every catalog definition, in catalog order."""
    return [*FILTER_CATALOG.values()]
342
+
343
+
344
def get_filter_definition(filter_id: str) -> Optional[FilterDefinition]:
    """Look up a definition by id, or return ``None`` when it is unknown.

    A falsy ``filter_id`` is treated as the empty string, and surrounding
    whitespace is stripped before the lookup.
    """
    lookup_key = (filter_id or "").strip()
    return FILTER_CATALOG.get(lookup_key)
346
+
347
+
348
+ def _validate_iso_datetime(value: str) -> None:
349
+ datetime.fromisoformat(value.replace("Z", "+00:00"))
350
+
351
+
352
+ def _validate_param(spec: FilterParamSpec, value: Any) -> None:
353
+ if spec.value_type == "bool":
354
+ if not isinstance(value, bool):
355
+ raise ValueError(f"Parameter {spec.name!r} must be a bool")
356
+ return
357
+ if spec.value_type == "int":
358
+ if not isinstance(value, int) or isinstance(value, bool):
359
+ raise ValueError(f"Parameter {spec.name!r} must be an int")
360
+ if spec.min_value is not None and value < spec.min_value:
361
+ raise ValueError(f"Parameter {spec.name!r} must be >= {spec.min_value}")
362
+ return
363
+ if spec.value_type == "str":
364
+ if not isinstance(value, str) or not value.strip():
365
+ raise ValueError(f"Parameter {spec.name!r} must be a non-empty string")
366
+ return
367
+ if spec.value_type == "iso_datetime":
368
+ if not isinstance(value, str):
369
+ raise ValueError(f"Parameter {spec.name!r} must be an ISO datetime string")
370
+ _validate_iso_datetime(value)
371
+ return
372
+ if spec.value_type == "enum":
373
+ if value not in (spec.allowed_values or []):
374
+ raise ValueError(f"Parameter {spec.name!r} must be one of {spec.allowed_values}")
375
+ return
376
+ if spec.value_type == "list":
377
+ if not isinstance(value, list):
378
+ raise ValueError(f"Parameter {spec.name!r} must be a list")
379
+ if spec.item_type == "str":
380
+ if any(not isinstance(item, str) or not item.strip() for item in value):
381
+ raise ValueError(f"Parameter {spec.name!r} must contain only non-empty strings")
382
+ return
383
+ raise ValueError(f"Unsupported parameter type {spec.value_type!r} for {spec.name!r}")
384
+
385
+
386
def validate_filter_params(filter_id: str, params: Dict[str, Any], *, allow_future: bool = False) -> None:
    """Validate *params* against the catalog schema for *filter_id*.

    Args:
        filter_id: Identifier looked up in the filter catalog.
        params: Mapping of parameter name to value.
        allow_future: When True, filters whose runtime status is
            ``FUTURE_ONLY`` are accepted instead of rejected.

    Raises:
        ValueError: for an unknown filter, a future-only filter without
            ``allow_future``, a missing required parameter, an unexpected
            parameter, or a value that fails its spec.
    """
    definition = get_filter_definition(filter_id)
    if not definition:
        raise ValueError(f"Unknown filter_id: {filter_id}")

    future_only = definition.runtime_status == FilterRuntimeStatus.FUTURE_ONLY
    if future_only and not allow_future:
        raise ValueError(f"Filter {filter_id!r} is not supported for runtime manifests yet")

    schema = definition.parameter_schema
    specs_by_name = {spec.name: spec for spec in schema}

    # Required parameters are checked first, in schema order.
    for spec in schema:
        if spec.required and spec.name not in params:
            raise ValueError(f"Missing required parameter {spec.name!r} for filter {filter_id!r}")

    # Then every supplied key must be declared and its value well-formed.
    for key, value in params.items():
        if key not in specs_by_name:
            raise ValueError(f"Unexpected parameter {key!r} for filter {filter_id!r}")
        _validate_param(specs_by_name[key], value)
400
+
401
+
402
class FilterInstance(BaseModel):
    """One configured use of a catalog filter: its id, category and parameters."""

    filter_id: str = Field(..., min_length=1)
    category: Optional[FilterCategory] = Field(None)
    params: Dict[str, Any] = Field(default_factory=dict)

    @field_validator("filter_id")
    @classmethod
    def known_filter_id(cls, value: str) -> str:
        """Strip the id and reject it if the catalog has no such filter."""
        filter_id = (value or "").strip()
        if get_filter_definition(filter_id):
            return filter_id
        raise ValueError(f"Unknown filter_id: {filter_id}")

    @model_validator(mode="after")
    def validate_against_catalog(self) -> "FilterInstance":
        """Fill in or verify the category, then validate the parameters.

        A missing category is taken from the catalog definition; a supplied
        category must match it. Parameters are validated without
        ``allow_future``, so future-only filters are rejected.
        """
        definition = get_filter_definition(self.filter_id)
        if definition is None:
            raise ValueError(f"Unknown filter_id: {self.filter_id}")

        catalog_category = definition.category
        if self.category is None:
            self.category = catalog_category
        elif self.category != catalog_category:
            raise ValueError(
                f"Filter {self.filter_id!r} must use category {definition.category.value!r}, "
                f"got {self.category.value!r}"
            )

        validate_filter_params(self.filter_id, self.params)
        return self

    def to_storage_dict(self) -> Dict[str, Any]:
        """Dump the model in JSON mode, omitting ``None`` fields."""
        return self.model_dump(mode="json", exclude_none=True)
432
+
433
+
434
class FieldTransform(BaseModel):
    """
    A single transform bound to one field, optionally scoped to a table.

    ``transform_id`` must name an entry in FILTER_CATALOG. Parameters are
    validated against that entry with future-only filters permitted.
    """

    table_id: Optional[str] = Field(None, description="Canonical table; optional if scope implies one table")
    field: str = Field(..., min_length=1, description="Column/field name to transform")
    transform_id: str = Field(..., min_length=1, description="Filter/transform ID from FILTER_CATALOG")
    params: Dict[str, Any] = Field(default_factory=dict, description="Transform parameters")

    @field_validator("transform_id")
    @classmethod
    def known_transform_id(cls, value: str) -> str:
        """Strip the id and reject it if the catalog has no such entry."""
        transform_id = (value or "").strip()
        if get_filter_definition(transform_id):
            return transform_id
        raise ValueError(f"Unknown transform_id: {transform_id}")

    @model_validator(mode="after")
    def validate_params_against_catalog(self) -> "FieldTransform":
        """Validate ``params`` against the catalog when the id resolves."""
        if get_filter_definition(self.transform_id) is None:
            return self
        validate_filter_params(self.transform_id, self.params, allow_future=True)
        return self

    def to_storage_dict(self) -> Dict[str, Any]:
        """Dump the model in JSON mode, omitting ``None`` fields."""
        return self.model_dump(mode="json", exclude_none=True)
463
+
464
+
465
def validate_field_transforms(field_transforms: List[Any]) -> None:
    """
    Check that *field_transforms* is a list of valid transform entries.

    ``FieldTransform`` instances are accepted as-is; dicts are run through
    ``FieldTransform.model_validate``. Raises ValueError for a non-list
    argument or an entry of any other type; validation stops at the first
    failing entry.
    """
    if not isinstance(field_transforms, list):
        raise ValueError("field_transforms must be a list")
    for i, item in enumerate(field_transforms):
        if isinstance(item, FieldTransform):
            continue
        if not isinstance(item, dict):
            raise ValueError(f"field_transforms[{i}] must be a FieldTransform or dict, got {type(item).__name__}")
        FieldTransform.model_validate(item)
479
+
480
+
481
def field_transforms_from_storage(value: Any) -> Optional[List[FieldTransform]]:
    """Rebuild field transforms from their stored form.

    ``None`` stays ``None``; a list (possibly empty) is mapped entry by
    entry through ``FieldTransform.model_validate``; anything else raises
    ValueError.
    """
    if value is None:
        return None
    if not isinstance(value, list):
        raise ValueError("field_transforms must be a list or None")
    # An empty list produces an empty result without any validation calls.
    return [FieldTransform.model_validate(entry) for entry in value]
490
+
491
+
492
class FilterManifestProvenance(BaseModel):
    """Bookkeeping about where a manifest's filters came from.

    NOTE(review): the meaning of each field is inferred from its name only;
    confirm against the code that populates them.
    """

    resource_defaults_applied: bool = False
    role_defaults_applied: List[str] = Field(default_factory=list)
    source_defaults_applied: List[str] = Field(default_factory=list)
    owner_overrides: List[str] = Field(default_factory=list)
497
+
498
+
499
class FilterManifest(BaseModel):
    """Versioned, ordered collection of filter instances with optional provenance."""

    manifest_version: int = Field(1, ge=1)
    filters: List[FilterInstance] = Field(default_factory=list)
    provenance: Optional[FilterManifestProvenance] = None

    def to_storage_dict(self) -> Dict[str, Any]:
        """Dump the manifest in JSON mode, omitting ``None`` fields."""
        return self.model_dump(mode="json", exclude_none=True)

    def get_filter(self, filter_id: str) -> Optional[FilterInstance]:
        """Return the first filter whose id equals *filter_id*, else ``None``."""
        matches = (entry for entry in self.filters if entry.filter_id == filter_id)
        return next(matches, None)

    def iter_filters(self, categories: Optional[Iterable[FilterCategory]] = None) -> List[FilterInstance]:
        """Return the filters as a new list, optionally limited to *categories*.

        With ``categories=None`` every filter is returned; otherwise only
        those whose category is in the given iterable, in manifest order.
        """
        if categories is not None:
            wanted = set(categories)
            return [entry for entry in self.filters if entry.category in wanted]
        return list(self.filters)
518
+
519
+
520
def filter_manifest_from_storage(value: Any) -> Optional[FilterManifest]:
    """Coerce a stored value into a FilterManifest.

    None and existing FilterManifest instances are returned unchanged; a dict
    is parsed with ``FilterManifest.model_validate``.

    Raises:
        ValueError: if ``value`` is any other type.
    """
    if value is None or isinstance(value, FilterManifest):
        return value
    if isinstance(value, dict):
        return FilterManifest.model_validate(value)
    raise ValueError("filter_manifest must be a dict")
528
+
529
+
530
def build_filter_manifest(*filters: FilterInstance, provenance: Optional[FilterManifestProvenance] = None) -> FilterManifest:
    """Assemble a FilterManifest from positional filters, in the order given.

    ``provenance`` is keyword-only and passed through as-is.
    """
    ordered_filters = [*filters]
    return FilterManifest(provenance=provenance, filters=ordered_filters)
532
+
533
+
534
def _merge_min_int_param(merged: Dict[str, Any], incoming: Dict[str, Any], key: str) -> Dict[str, Any]:
    """Keep the smaller of two integer caps stored under ``key`` (updates and returns ``merged``)."""
    candidate = incoming.get(key)
    if candidate is None:
        # Nothing to tighten; in particular do not write an explicit None.
        return merged
    current = merged.get(key)
    merged[key] = candidate if current is None else min(int(current), int(candidate))
    return merged


def _merge_intersect_list_param(merged: Dict[str, Any], incoming: Dict[str, Any], key: str) -> Dict[str, Any]:
    """Intersect two allow-style lists stored under ``key`` (updates and returns ``merged``).

    An empty or missing list on either side is treated as "no restriction
    from that side", so the other side's list wins.
    """
    current = {str(value) for value in (merged.get(key) or [])}
    candidate = {str(value) for value in (incoming.get(key) or [])}
    if current and candidate:
        # NOTE(review): disjoint lists produce [], which a later merge would
        # again read as "no restriction" — confirm how consumers treat [].
        merged[key] = sorted(current & candidate)
    elif candidate:
        merged[key] = sorted(candidate)
    return merged


def _merge_contact_participation_params(existing: Dict[str, Any], incoming: Dict[str, Any]) -> Dict[str, Any]:
    """Combine two ``message_contact_participation`` param dicts into a fresh dict."""
    existing_mode = str(existing.get("mode") or "all")
    incoming_mode = str(incoming.get("mode") or "all")
    existing_ids = {str(value) for value in (existing.get("contact_ids") or [])}
    incoming_ids = {str(value) for value in (incoming.get("contact_ids") or [])}
    # The incoming side's match setting takes precedence when it has one.
    match = str(incoming.get("match") or existing.get("match") or "sender_only")
    modes = (existing_mode, incoming_mode)

    if "blocklist" in modes:
        if "allowlist" in modes:
            # Mixed modes: a contact passes only if it is allow-listed and not
            # blocked. Folding the allowlist ids into a blocklist (the former
            # behaviour) would block exactly the contacts that were allowed.
            if existing_mode == "allowlist":
                allowed, blocked = existing_ids, incoming_ids
            else:
                allowed, blocked = incoming_ids, existing_ids
            if allowed:
                return {"mode": "allowlist", "contact_ids": sorted(allowed - blocked), "match": match}
            # An allowlist without ids is treated as unrestricted, as in the
            # allowlist/allowlist case below, so only the blocklist applies.
            return {"mode": "blocklist", "contact_ids": sorted(blocked), "match": match}
        return {"mode": "blocklist", "contact_ids": sorted(existing_ids | incoming_ids), "match": match}

    if existing_mode == "allowlist" and incoming_mode == "allowlist":
        if existing_ids and incoming_ids:
            combined = existing_ids & incoming_ids
        else:
            combined = existing_ids | incoming_ids
        return {"mode": "allowlist", "contact_ids": sorted(combined), "match": match}
    if incoming_mode != "all":
        return {"mode": incoming_mode, "contact_ids": sorted(incoming_ids), "match": match}
    if existing_mode != "all":
        return {"mode": existing_mode, "contact_ids": sorted(existing_ids), "match": match}
    return {"mode": "all", "contact_ids": [], "match": match}


def _merge_param_values(filter_id: str, existing: Dict[str, Any], incoming: Dict[str, Any]) -> Dict[str, Any]:
    """Merge the params of two filter instances that share a merge key.

    The combination rule depends on ``filter_id``:

    * ``rolling_window_days`` (``days``) and ``max_rows`` / ``most_recent_n``
      (``count``): the smaller integer wins.
    * ``date_range``: the later ``start`` and the earlier ``end`` win.
    * ``source_filter`` (``source_ids``) and ``column_allowlist`` (``fields``):
      intersection when both sides list values, otherwise whichever side does.
    * ``column_blocklist`` (``fields``): union.
    * ``contact_display_names``: ``enabled`` only if both sides enable it
      (a missing value counts as enabled).
    * ``message_contact_participation``: blocklists are unioned, allowlists
      intersected, and an allowlist combined with a blocklist yields the
      allowlist minus the blocked ids.
    * anything else: ``incoming`` if it is non-empty, else a copy of
      ``existing``.

    Args:
        filter_id: Identifier selecting the merge rule.
        existing: Params accumulated so far; never mutated.
        incoming: Params of the filter being merged in.

    Returns:
        The merged params. Except for the fallback rule (which may return
        ``incoming`` itself) this is a new dict.
    """
    merged = dict(existing)
    if filter_id == "rolling_window_days":
        return _merge_min_int_param(merged, incoming, "days")
    if filter_id in {"max_rows", "most_recent_n"}:
        return _merge_min_int_param(merged, incoming, "count")
    if filter_id == "date_range":
        start = merged.get("start")
        end = merged.get("end")
        incoming_start = incoming.get("start")
        incoming_end = incoming.get("end")
        # Bounds are compared as strings; this presumably relies on both sides
        # using the same lexicographically sortable format (e.g. ISO 8601).
        if incoming_start is not None and (start is None or str(incoming_start) > str(start)):
            merged["start"] = incoming_start
        if incoming_end is not None and (end is None or str(incoming_end) < str(end)):
            merged["end"] = incoming_end
        return merged
    if filter_id == "source_filter":
        return _merge_intersect_list_param(merged, incoming, "source_ids")
    if filter_id == "column_allowlist":
        return _merge_intersect_list_param(merged, incoming, "fields")
    if filter_id == "column_blocklist":
        current = {str(value) for value in (merged.get("fields") or [])}
        incoming_values = {str(value) for value in (incoming.get("fields") or [])}
        merged["fields"] = sorted(current | incoming_values)
        return merged
    if filter_id == "contact_display_names":
        # Stricter: both must allow names.
        merged["enabled"] = bool(merged.get("enabled", True)) and bool(incoming.get("enabled", True))
        return merged
    if filter_id == "message_contact_participation":
        return _merge_contact_participation_params(merged, incoming)
    return incoming if incoming else merged
606
+
607
+
608
def _manifest_merge_key(item: FilterInstance) -> str:
    """Return the key under which ``item`` is merged with other filters.

    Most filters are keyed by ``filter_id`` alone. The retrieval caps
    (``rolling_window_days``, ``max_rows``, ``most_recent_n``) are keyed per
    table: the stripped ``params["table_id"]`` (empty if absent) is appended
    after a NUL separator.
    """
    filter_id = item.filter_id
    if filter_id not in {"rolling_window_days", "max_rows", "most_recent_n"}:
        return filter_id
    raw_table_id = item.params.get("table_id") or ""
    table_id = str(raw_table_id).strip()
    return f"{filter_id}\x00{table_id}"
615
+
616
+
617
def merge_filter_manifests(
    manifests: Iterable[Optional[FilterManifest]],
    *,
    provenance: Optional[FilterManifestProvenance] = None,
) -> FilterManifest:
    """Fold several manifests into one.

    Manifests are processed in order and None entries are ignored. Filters
    are grouped by ``_manifest_merge_key``; the first filter seen for a key is
    kept as-is, and each later one is combined with it into a new
    FilterInstance that takes ``filter_id`` and ``category`` from the later
    filter and its params from ``_merge_param_values``. Only those three
    fields are passed to the new instance.

    Returns:
        A FilterManifest whose filters are sorted by merge key (then
        ``filter_id``), carrying the given ``provenance``.
    """
    by_key: Dict[str, FilterInstance] = {}
    for manifest in manifests:
        if manifest is None:
            continue
        for incoming in manifest.filters:
            merge_key = _manifest_merge_key(incoming)
            previous = by_key.get(merge_key)
            if previous is not None:
                incoming = FilterInstance(
                    filter_id=incoming.filter_id,
                    category=incoming.category,
                    params=_merge_param_values(incoming.filter_id, previous.params, incoming.params),
                )
            by_key[merge_key] = incoming

    def sort_key(entry: FilterInstance):
        return (_manifest_merge_key(entry), entry.filter_id)

    ordered = sorted(by_key.values(), key=sort_key)
    return FilterManifest(filters=ordered, provenance=provenance)