truthound-dashboard 1.4.3__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. truthound_dashboard/api/alerts.py +75 -86
  2. truthound_dashboard/api/anomaly.py +7 -13
  3. truthound_dashboard/api/cross_alerts.py +38 -52
  4. truthound_dashboard/api/drift.py +49 -59
  5. truthound_dashboard/api/drift_monitor.py +234 -79
  6. truthound_dashboard/api/enterprise_sampling.py +498 -0
  7. truthound_dashboard/api/history.py +57 -5
  8. truthound_dashboard/api/lineage.py +3 -48
  9. truthound_dashboard/api/maintenance.py +104 -49
  10. truthound_dashboard/api/mask.py +1 -2
  11. truthound_dashboard/api/middleware.py +2 -1
  12. truthound_dashboard/api/model_monitoring.py +435 -311
  13. truthound_dashboard/api/notifications.py +227 -191
  14. truthound_dashboard/api/notifications_advanced.py +21 -20
  15. truthound_dashboard/api/observability.py +586 -0
  16. truthound_dashboard/api/plugins.py +2 -433
  17. truthound_dashboard/api/profile.py +199 -37
  18. truthound_dashboard/api/quality_reporter.py +701 -0
  19. truthound_dashboard/api/reports.py +7 -16
  20. truthound_dashboard/api/router.py +66 -0
  21. truthound_dashboard/api/rule_suggestions.py +5 -5
  22. truthound_dashboard/api/scan.py +17 -19
  23. truthound_dashboard/api/schedules.py +85 -50
  24. truthound_dashboard/api/schema_evolution.py +6 -6
  25. truthound_dashboard/api/schema_watcher.py +667 -0
  26. truthound_dashboard/api/sources.py +98 -27
  27. truthound_dashboard/api/tiering.py +1323 -0
  28. truthound_dashboard/api/triggers.py +14 -11
  29. truthound_dashboard/api/validations.py +12 -11
  30. truthound_dashboard/api/versioning.py +1 -6
  31. truthound_dashboard/core/__init__.py +129 -3
  32. truthound_dashboard/core/actions/__init__.py +62 -0
  33. truthound_dashboard/core/actions/custom.py +426 -0
  34. truthound_dashboard/core/actions/notifications.py +910 -0
  35. truthound_dashboard/core/actions/storage.py +472 -0
  36. truthound_dashboard/core/actions/webhook.py +281 -0
  37. truthound_dashboard/core/anomaly.py +262 -67
  38. truthound_dashboard/core/anomaly_explainer.py +4 -3
  39. truthound_dashboard/core/backends/__init__.py +67 -0
  40. truthound_dashboard/core/backends/base.py +299 -0
  41. truthound_dashboard/core/backends/errors.py +191 -0
  42. truthound_dashboard/core/backends/factory.py +423 -0
  43. truthound_dashboard/core/backends/mock_backend.py +451 -0
  44. truthound_dashboard/core/backends/truthound_backend.py +718 -0
  45. truthound_dashboard/core/checkpoint/__init__.py +87 -0
  46. truthound_dashboard/core/checkpoint/adapters.py +814 -0
  47. truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
  48. truthound_dashboard/core/checkpoint/runner.py +270 -0
  49. truthound_dashboard/core/connections.py +437 -10
  50. truthound_dashboard/core/converters/__init__.py +14 -0
  51. truthound_dashboard/core/converters/truthound.py +620 -0
  52. truthound_dashboard/core/cross_alerts.py +540 -320
  53. truthound_dashboard/core/datasource_factory.py +1672 -0
  54. truthound_dashboard/core/drift_monitor.py +216 -20
  55. truthound_dashboard/core/enterprise_sampling.py +1291 -0
  56. truthound_dashboard/core/interfaces/__init__.py +225 -0
  57. truthound_dashboard/core/interfaces/actions.py +652 -0
  58. truthound_dashboard/core/interfaces/base.py +247 -0
  59. truthound_dashboard/core/interfaces/checkpoint.py +676 -0
  60. truthound_dashboard/core/interfaces/protocols.py +664 -0
  61. truthound_dashboard/core/interfaces/reporters.py +650 -0
  62. truthound_dashboard/core/interfaces/routing.py +646 -0
  63. truthound_dashboard/core/interfaces/triggers.py +619 -0
  64. truthound_dashboard/core/lineage.py +407 -71
  65. truthound_dashboard/core/model_monitoring.py +431 -3
  66. truthound_dashboard/core/notifications/base.py +4 -0
  67. truthound_dashboard/core/notifications/channels.py +501 -1203
  68. truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
  69. truthound_dashboard/core/notifications/deduplication/service.py +131 -348
  70. truthound_dashboard/core/notifications/dispatcher.py +202 -11
  71. truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
  72. truthound_dashboard/core/notifications/escalation/engine.py +168 -358
  73. truthound_dashboard/core/notifications/routing/__init__.py +88 -128
  74. truthound_dashboard/core/notifications/routing/engine.py +90 -317
  75. truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
  76. truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
  77. truthound_dashboard/core/notifications/throttling/builder.py +117 -255
  78. truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
  79. truthound_dashboard/core/phase5/collaboration.py +1 -1
  80. truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
  81. truthound_dashboard/core/quality_reporter.py +1359 -0
  82. truthound_dashboard/core/report_history.py +0 -6
  83. truthound_dashboard/core/reporters/__init__.py +175 -14
  84. truthound_dashboard/core/reporters/adapters.py +943 -0
  85. truthound_dashboard/core/reporters/base.py +0 -3
  86. truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
  87. truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
  88. truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
  89. truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
  90. truthound_dashboard/core/reporters/compat.py +266 -0
  91. truthound_dashboard/core/reporters/csv_reporter.py +2 -35
  92. truthound_dashboard/core/reporters/factory.py +526 -0
  93. truthound_dashboard/core/reporters/interfaces.py +745 -0
  94. truthound_dashboard/core/reporters/registry.py +1 -10
  95. truthound_dashboard/core/scheduler.py +165 -0
  96. truthound_dashboard/core/schema_evolution.py +3 -3
  97. truthound_dashboard/core/schema_watcher.py +1528 -0
  98. truthound_dashboard/core/services.py +595 -76
  99. truthound_dashboard/core/store_manager.py +810 -0
  100. truthound_dashboard/core/streaming_anomaly.py +169 -4
  101. truthound_dashboard/core/tiering.py +1309 -0
  102. truthound_dashboard/core/triggers/evaluators.py +178 -8
  103. truthound_dashboard/core/truthound_adapter.py +2620 -197
  104. truthound_dashboard/core/unified_alerts.py +23 -20
  105. truthound_dashboard/db/__init__.py +8 -0
  106. truthound_dashboard/db/database.py +8 -2
  107. truthound_dashboard/db/models.py +944 -25
  108. truthound_dashboard/db/repository.py +2 -0
  109. truthound_dashboard/main.py +11 -0
  110. truthound_dashboard/schemas/__init__.py +177 -16
  111. truthound_dashboard/schemas/base.py +44 -23
  112. truthound_dashboard/schemas/collaboration.py +19 -6
  113. truthound_dashboard/schemas/cross_alerts.py +19 -3
  114. truthound_dashboard/schemas/drift.py +61 -55
  115. truthound_dashboard/schemas/drift_monitor.py +67 -23
  116. truthound_dashboard/schemas/enterprise_sampling.py +653 -0
  117. truthound_dashboard/schemas/lineage.py +0 -33
  118. truthound_dashboard/schemas/mask.py +10 -8
  119. truthound_dashboard/schemas/model_monitoring.py +89 -10
  120. truthound_dashboard/schemas/notifications_advanced.py +13 -0
  121. truthound_dashboard/schemas/observability.py +453 -0
  122. truthound_dashboard/schemas/plugins.py +0 -280
  123. truthound_dashboard/schemas/profile.py +154 -247
  124. truthound_dashboard/schemas/quality_reporter.py +403 -0
  125. truthound_dashboard/schemas/reports.py +2 -2
  126. truthound_dashboard/schemas/rule_suggestion.py +8 -1
  127. truthound_dashboard/schemas/scan.py +4 -24
  128. truthound_dashboard/schemas/schedule.py +11 -3
  129. truthound_dashboard/schemas/schema_watcher.py +727 -0
  130. truthound_dashboard/schemas/source.py +17 -2
  131. truthound_dashboard/schemas/tiering.py +822 -0
  132. truthound_dashboard/schemas/triggers.py +16 -0
  133. truthound_dashboard/schemas/unified_alerts.py +7 -0
  134. truthound_dashboard/schemas/validation.py +0 -13
  135. truthound_dashboard/schemas/validators/base.py +41 -21
  136. truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
  137. truthound_dashboard/schemas/validators/localization_validators.py +273 -0
  138. truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
  139. truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
  140. truthound_dashboard/schemas/validators/referential_validators.py +312 -0
  141. truthound_dashboard/schemas/validators/registry.py +93 -8
  142. truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
  143. truthound_dashboard/schemas/versioning.py +1 -6
  144. truthound_dashboard/static/index.html +2 -2
  145. truthound_dashboard-1.5.0.dist-info/METADATA +309 -0
  146. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/RECORD +149 -148
  147. truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
  148. truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
  149. truthound_dashboard/core/plugins/hooks/manager.py +0 -403
  150. truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
  151. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
  152. truthound_dashboard/core/reporters/junit_reporter.py +0 -233
  153. truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
  154. truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
  155. truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
  156. truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
  157. truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
  158. truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
  159. truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
  160. truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
  161. truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
  162. truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
  163. truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
  164. truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
  165. truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
  166. truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
  167. truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
  168. truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
  169. truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
  170. truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
  171. truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
  172. truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
  173. truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
  174. truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
  175. truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
  176. truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
  177. truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
  178. truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
  179. truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
  180. truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
  181. truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
  182. truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
  183. truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
  184. truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
  185. truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
  186. truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
  187. truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
  188. truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
  189. truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
  190. truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
  191. truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
  192. truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
  193. truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
  194. truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
  195. truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
  196. truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
  197. truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
  198. truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
  199. truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
  200. truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
  201. truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
  202. truthound_dashboard-1.4.3.dist-info/METADATA +0 -505
  203. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/WHEEL +0 -0
  204. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/entry_points.txt +0 -0
  205. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,653 @@
1
+ """Enterprise Sampling Schemas.
2
+
3
+ This module provides Pydantic models for truthound 1.2.10's enterprise-scale
4
+ sampling capabilities, supporting 100M+ row datasets with:
5
+ - Block Sampling
6
+ - Multi-Stage Sampling
7
+ - Column-Aware Sampling
8
+ - Progressive Sampling
9
+ - Probabilistic Data Structures (HyperLogLog, Count-Min Sketch, Bloom Filter)
10
+
11
+ Architecture follows the Strategy pattern for extensibility.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from datetime import datetime
17
+ from enum import Enum
18
+ from typing import Any
19
+
20
+ from pydantic import BaseModel, ConfigDict, Field
21
+
22
+
23
+ # ============================================================================
24
+ # Enums
25
+ # ============================================================================
26
+
27
+
28
+ class ScaleCategory(str, Enum):
29
+ """Dataset scale categories for automatic strategy selection."""
30
+
31
+ SMALL = "small" # < 1M rows - no sampling needed
32
+ MEDIUM = "medium" # 1M - 10M rows - column-aware sampling
33
+ LARGE = "large" # 10M - 100M rows - block sampling
34
+ XLARGE = "xlarge" # 100M - 1B rows - multi-stage sampling
35
+ XXLARGE = "xxlarge" # > 1B rows - sketches + multi-stage
36
+
37
+
38
+ class EnterpriseSamplingStrategy(str, Enum):
39
+ """Enterprise-scale sampling strategies from truthound 1.2.10."""
40
+
41
+ # Basic strategies (already supported)
42
+ NONE = "none"
43
+ RANDOM = "random"
44
+ HEAD = "head"
45
+ TAIL = "tail"
46
+ STRATIFIED = "stratified"
47
+ RESERVOIR = "reservoir"
48
+ SYSTEMATIC = "systematic"
49
+ ADAPTIVE = "adaptive"
50
+ HASH = "hash"
51
+
52
+ # Enterprise strategies (new in 1.2.10)
53
+ BLOCK = "block" # Block-based parallel sampling
54
+ MULTI_STAGE = "multi_stage" # Hierarchical multi-stage sampling
55
+ COLUMN_AWARE = "column_aware" # Type-weighted adaptive sampling
56
+ PROGRESSIVE = "progressive" # Convergence-based iterative sampling
57
+ PARALLEL_BLOCK = "parallel_block" # Multi-threaded block sampling
58
+
59
+
60
+ class SamplingQuality(str, Enum):
61
+ """Sampling quality presets."""
62
+
63
+ SKETCH = "sketch" # Fast approximation, 10K samples
64
+ QUICK = "quick" # 90% confidence, 50K samples
65
+ STANDARD = "standard" # 95% confidence, 100K samples (default)
66
+ HIGH = "high" # 99% confidence, 500K samples
67
+ EXACT = "exact" # Full scan, 100% accuracy
68
+
69
+
70
+ class SketchType(str, Enum):
71
+ """Probabilistic data structure types."""
72
+
73
+ HYPERLOGLOG = "hyperloglog" # Cardinality estimation
74
+ COUNTMIN = "countmin" # Frequency estimation
75
+ BLOOM = "bloom" # Membership testing
76
+
77
+
78
+ class SchedulingPolicy(str, Enum):
79
+ """Parallel execution scheduling policies."""
80
+
81
+ ROUND_ROBIN = "round_robin"
82
+ WORK_STEALING = "work_stealing"
83
+ ADAPTIVE = "adaptive"
84
+
85
+
86
+ # ============================================================================
87
+ # Configuration Models
88
+ # ============================================================================
89
+
90
+
91
+ class MemoryBudgetConfig(BaseModel):
92
+ """Memory budget configuration for enterprise sampling."""
93
+
94
+ model_config = ConfigDict(extra="forbid")
95
+
96
+ max_memory_mb: int = Field(
97
+ default=1024,
98
+ ge=128,
99
+ le=65536,
100
+ description="Maximum memory in MB",
101
+ )
102
+ reserved_memory_mb: int = Field(
103
+ default=256,
104
+ ge=64,
105
+ le=8192,
106
+ description="Reserved memory for system operations",
107
+ )
108
+ gc_threshold_mb: int | None = Field(
109
+ default=None,
110
+ description="GC trigger threshold (default: 75% of max)",
111
+ )
112
+ backpressure_enabled: bool = Field(
113
+ default=True,
114
+ description="Enable memory backpressure",
115
+ )
116
+
117
+
118
+ class ParallelSamplingConfig(BaseModel):
119
+ """Parallel block sampling configuration."""
120
+
121
+ model_config = ConfigDict(extra="forbid")
122
+
123
+ max_workers: int = Field(
124
+ default=4,
125
+ ge=1,
126
+ le=32,
127
+ description="Maximum parallel workers (0 = auto)",
128
+ )
129
+ enable_work_stealing: bool = Field(
130
+ default=True,
131
+ description="Enable work stealing for load balancing",
132
+ )
133
+ scheduling_policy: SchedulingPolicy = Field(
134
+ default=SchedulingPolicy.ADAPTIVE,
135
+ description="Task scheduling policy",
136
+ )
137
+ backpressure_threshold: float = Field(
138
+ default=0.75,
139
+ ge=0.5,
140
+ le=0.95,
141
+ description="Memory threshold for backpressure (0.0-1.0)",
142
+ )
143
+ chunk_timeout_seconds: float = Field(
144
+ default=30.0,
145
+ ge=1.0,
146
+ le=3600.0,
147
+ description="Timeout per block in seconds",
148
+ )
149
+
150
+
151
+ class BlockSamplingConfig(BaseModel):
152
+ """Block sampling specific configuration."""
153
+
154
+ model_config = ConfigDict(extra="forbid")
155
+
156
+ block_size: int = Field(
157
+ default=0,
158
+ ge=0,
159
+ description="Rows per block (0 = auto-detect)",
160
+ )
161
+ sample_per_block: int | None = Field(
162
+ default=None,
163
+ description="Samples per block (None = proportional)",
164
+ )
165
+ parallel: ParallelSamplingConfig = Field(
166
+ default_factory=ParallelSamplingConfig,
167
+ description="Parallel processing configuration",
168
+ )
169
+
170
+
171
+ class MultiStageSamplingConfig(BaseModel):
172
+ """Multi-stage hierarchical sampling configuration."""
173
+
174
+ model_config = ConfigDict(extra="forbid")
175
+
176
+ num_stages: int = Field(
177
+ default=3,
178
+ ge=2,
179
+ le=5,
180
+ description="Number of sampling stages",
181
+ )
182
+ stage_reduction_factor: float | None = Field(
183
+ default=None,
184
+ description="Reduction factor per stage (None = auto)",
185
+ )
186
+ early_stop_enabled: bool = Field(
187
+ default=True,
188
+ description="Enable early stopping on convergence",
189
+ )
190
+
191
+
192
+ class ColumnAwareSamplingConfig(BaseModel):
193
+ """Column-aware adaptive sampling configuration."""
194
+
195
+ model_config = ConfigDict(extra="forbid")
196
+
197
+ string_multiplier: float = Field(
198
+ default=2.0,
199
+ ge=1.0,
200
+ le=5.0,
201
+ description="Sample multiplier for string columns",
202
+ )
203
+ categorical_multiplier: float = Field(
204
+ default=0.5,
205
+ ge=0.1,
206
+ le=2.0,
207
+ description="Sample multiplier for categorical columns",
208
+ )
209
+ complex_multiplier: float = Field(
210
+ default=3.0,
211
+ ge=1.0,
212
+ le=10.0,
213
+ description="Sample multiplier for complex types (List/Struct)",
214
+ )
215
+ numeric_multiplier: float = Field(
216
+ default=1.0,
217
+ ge=0.5,
218
+ le=2.0,
219
+ description="Baseline multiplier for numeric columns",
220
+ )
221
+
222
+
223
+ class ProgressiveSamplingConfig(BaseModel):
224
+ """Progressive sampling with convergence detection."""
225
+
226
+ model_config = ConfigDict(extra="forbid")
227
+
228
+ convergence_threshold: float = Field(
229
+ default=0.01,
230
+ ge=0.001,
231
+ le=0.1,
232
+ description="Convergence threshold (stop when estimates stabilize)",
233
+ )
234
+ max_stages: int = Field(
235
+ default=5,
236
+ ge=2,
237
+ le=10,
238
+ description="Maximum number of progressive stages",
239
+ )
240
+ initial_sample_ratio: float = Field(
241
+ default=0.01,
242
+ ge=0.001,
243
+ le=0.1,
244
+ description="Initial sample ratio (0.01 = 1%)",
245
+ )
246
+ growth_factor: float = Field(
247
+ default=2.0,
248
+ ge=1.5,
249
+ le=4.0,
250
+ description="Sample size growth factor per stage",
251
+ )
252
+
253
+
254
+ class SketchConfig(BaseModel):
255
+ """Probabilistic data structure configuration."""
256
+
257
+ model_config = ConfigDict(extra="forbid")
258
+
259
+ sketch_type: SketchType = Field(
260
+ default=SketchType.HYPERLOGLOG,
261
+ description="Type of sketch to use",
262
+ )
263
+
264
+ # HyperLogLog parameters
265
+ hll_precision: int = Field(
266
+ default=14,
267
+ ge=10,
268
+ le=18,
269
+ description="HyperLogLog precision (10-18, higher = more accurate)",
270
+ )
271
+
272
+ # Count-Min Sketch parameters
273
+ cms_width: int = Field(
274
+ default=2000,
275
+ ge=100,
276
+ le=100000,
277
+ description="Count-Min Sketch width",
278
+ )
279
+ cms_depth: int = Field(
280
+ default=5,
281
+ ge=3,
282
+ le=10,
283
+ description="Count-Min Sketch depth",
284
+ )
285
+ cms_epsilon: float | None = Field(
286
+ default=None,
287
+ description="Error bound (alternative to width)",
288
+ )
289
+ cms_delta: float | None = Field(
290
+ default=None,
291
+ description="Confidence level (alternative to depth)",
292
+ )
293
+
294
+ # Bloom Filter parameters
295
+ bloom_capacity: int = Field(
296
+ default=10_000_000,
297
+ ge=1000,
298
+ description="Expected number of items",
299
+ )
300
+ bloom_error_rate: float = Field(
301
+ default=0.01,
302
+ ge=0.0001,
303
+ le=0.1,
304
+ description="Desired false positive rate",
305
+ )
306
+
307
+
308
+ # ============================================================================
309
+ # Main Request/Response Models
310
+ # ============================================================================
311
+
312
+
313
+ class EnterpriseSamplingRequest(BaseModel):
314
+ """Request model for enterprise-scale sampling operations."""
315
+
316
+ model_config = ConfigDict(extra="forbid")
317
+
318
+ # Basic parameters
319
+ source_id: str = Field(..., description="Source ID to sample from")
320
+ target_rows: int = Field(
321
+ default=100_000,
322
+ ge=1000,
323
+ le=10_000_000,
324
+ description="Target number of rows to sample",
325
+ )
326
+ quality: SamplingQuality = Field(
327
+ default=SamplingQuality.STANDARD,
328
+ description="Sampling quality preset",
329
+ )
330
+
331
+ # Strategy selection
332
+ strategy: EnterpriseSamplingStrategy = Field(
333
+ default=EnterpriseSamplingStrategy.ADAPTIVE,
334
+ description="Sampling strategy (adaptive = auto-select)",
335
+ )
336
+
337
+ # Resource budgets
338
+ memory_budget: MemoryBudgetConfig = Field(
339
+ default_factory=MemoryBudgetConfig,
340
+ description="Memory budget configuration",
341
+ )
342
+ time_budget_seconds: float = Field(
343
+ default=0.0,
344
+ ge=0.0,
345
+ le=3600.0,
346
+ description="Time budget in seconds (0 = unlimited)",
347
+ )
348
+
349
+ # Statistical parameters
350
+ confidence_level: float = Field(
351
+ default=0.95,
352
+ ge=0.80,
353
+ le=0.99,
354
+ description="Statistical confidence level",
355
+ )
356
+ margin_of_error: float = Field(
357
+ default=0.05,
358
+ ge=0.01,
359
+ le=0.10,
360
+ description="Acceptable margin of error",
361
+ )
362
+
363
+ # Adaptive parameters
364
+ min_sample_ratio: float = Field(
365
+ default=0.001,
366
+ ge=0.0001,
367
+ le=0.1,
368
+ description="Minimum sample ratio",
369
+ )
370
+ max_sample_ratio: float = Field(
371
+ default=0.10,
372
+ ge=0.01,
373
+ le=1.0,
374
+ description="Maximum sample ratio",
375
+ )
376
+
377
+ # Reproducibility
378
+ seed: int | None = Field(
379
+ default=None,
380
+ description="Random seed for reproducibility",
381
+ )
382
+
383
+ # Strategy-specific configurations
384
+ block_config: BlockSamplingConfig | None = Field(
385
+ default=None,
386
+ description="Block sampling configuration",
387
+ )
388
+ multi_stage_config: MultiStageSamplingConfig | None = Field(
389
+ default=None,
390
+ description="Multi-stage sampling configuration",
391
+ )
392
+ column_aware_config: ColumnAwareSamplingConfig | None = Field(
393
+ default=None,
394
+ description="Column-aware sampling configuration",
395
+ )
396
+ progressive_config: ProgressiveSamplingConfig | None = Field(
397
+ default=None,
398
+ description="Progressive sampling configuration",
399
+ )
400
+
401
+ # Sketch parameters (for XXLARGE datasets)
402
+ sketch_config: SketchConfig | None = Field(
403
+ default=None,
404
+ description="Probabilistic sketch configuration",
405
+ )
406
+
407
+
408
+ class SamplingMetrics(BaseModel):
409
+ """Metrics from sampling operation."""
410
+
411
+ model_config = ConfigDict(extra="forbid")
412
+
413
+ # Basic metrics
414
+ original_rows: int = Field(..., description="Original row count")
415
+ sampled_rows: int = Field(..., description="Sampled row count")
416
+ sampling_ratio: float = Field(..., description="Actual sampling ratio")
417
+
418
+ # Strategy info
419
+ strategy_used: EnterpriseSamplingStrategy = Field(..., description="Strategy used")
420
+ scale_category: ScaleCategory = Field(..., description="Dataset scale category")
421
+ is_sampled: bool = Field(..., description="Whether sampling was performed")
422
+
423
+ # Performance metrics
424
+ sampling_time_ms: float = Field(..., description="Total sampling time in ms")
425
+ throughput_rows_per_sec: float = Field(..., description="Processing throughput")
426
+ speedup_factor: float = Field(
427
+ default=1.0,
428
+ description="Speedup compared to full scan",
429
+ )
430
+
431
+ # Resource usage
432
+ peak_memory_mb: float = Field(default=0.0, description="Peak memory usage in MB")
433
+ workers_used: int = Field(default=1, description="Number of workers used")
434
+ worker_utilization: float = Field(
435
+ default=0.0,
436
+ description="Worker utilization (0.0-1.0)",
437
+ )
438
+
439
+ # Block metrics (for block-based strategies)
440
+ blocks_processed: int | None = Field(
441
+ default=None,
442
+ description="Number of blocks processed",
443
+ )
444
+ time_per_block_ms: float | None = Field(
445
+ default=None,
446
+ description="Average time per block",
447
+ )
448
+
449
+ # Progressive metrics
450
+ stages_completed: int | None = Field(
451
+ default=None,
452
+ description="Number of progressive stages",
453
+ )
454
+ converged_early: bool | None = Field(
455
+ default=None,
456
+ description="Whether converged before max stages",
457
+ )
458
+
459
+ # Backpressure metrics
460
+ backpressure_events: int = Field(
461
+ default=0,
462
+ description="Number of backpressure events",
463
+ )
464
+
465
+ # Statistical info
466
+ margin_of_error_actual: float | None = Field(
467
+ default=None,
468
+ description="Achieved margin of error",
469
+ )
470
+ confidence_achieved: float | None = Field(
471
+ default=None,
472
+ description="Achieved confidence level",
473
+ )
474
+
475
+
476
+ class EnterpriseSamplingResponse(BaseModel):
477
+ """Response model for enterprise sampling operations."""
478
+
479
+ model_config = ConfigDict(extra="forbid")
480
+
481
+ # Request info
482
+ source_id: str = Field(..., description="Source ID")
483
+ job_id: str = Field(..., description="Sampling job ID")
484
+
485
+ # Status
486
+ status: str = Field(..., description="Job status: pending, running, completed, failed")
487
+ started_at: datetime = Field(..., description="Job start time")
488
+ completed_at: datetime | None = Field(None, description="Job completion time")
489
+
490
+ # Results
491
+ metrics: SamplingMetrics | None = Field(
492
+ None,
493
+ description="Sampling metrics (available when completed)",
494
+ )
495
+ sampled_data_path: str | None = Field(
496
+ None,
497
+ description="Path to sampled data file",
498
+ )
499
+
500
+ # Error info
501
+ error_message: str | None = Field(None, description="Error message if failed")
502
+
503
+
504
+ class SampleSizeEstimateRequest(BaseModel):
505
+ """Request for sample size estimation."""
506
+
507
+ model_config = ConfigDict(extra="forbid")
508
+
509
+ population_size: int = Field(..., ge=1, description="Total population size")
510
+ confidence_level: float = Field(
511
+ default=0.95,
512
+ ge=0.80,
513
+ le=0.99,
514
+ description="Desired confidence level",
515
+ )
516
+ margin_of_error: float = Field(
517
+ default=0.05,
518
+ ge=0.01,
519
+ le=0.10,
520
+ description="Desired margin of error",
521
+ )
522
+ quality: SamplingQuality = Field(
523
+ default=SamplingQuality.STANDARD,
524
+ description="Quality preset",
525
+ )
526
+
527
+
528
+ class SampleSizeEstimateResponse(BaseModel):
529
+ """Response with sample size recommendations."""
530
+
531
+ model_config = ConfigDict(extra="forbid")
532
+
533
+ population_size: int = Field(..., description="Input population size")
534
+ scale_category: ScaleCategory = Field(..., description="Dataset scale category")
535
+
536
+ # Recommended sizes
537
+ recommended_size: int = Field(..., description="Recommended sample size")
538
+ min_size: int = Field(..., description="Minimum acceptable sample size")
539
+ max_size: int = Field(..., description="Maximum useful sample size")
540
+
541
+ # Estimates
542
+ estimated_time_seconds: float = Field(..., description="Estimated processing time")
543
+ estimated_memory_mb: float = Field(..., description="Estimated memory usage")
544
+ speedup_factor: float = Field(..., description="Expected speedup factor")
545
+
546
+ # Strategy recommendation
547
+ recommended_strategy: EnterpriseSamplingStrategy = Field(
548
+ ...,
549
+ description="Recommended sampling strategy",
550
+ )
551
+ strategy_rationale: str = Field(..., description="Why this strategy is recommended")
552
+
553
+
554
+ class SketchEstimateRequest(BaseModel):
555
+ """Request for sketch-based estimation."""
556
+
557
+ model_config = ConfigDict(extra="forbid")
558
+
559
+ source_id: str = Field(..., description="Source ID")
560
+ columns: list[str] = Field(..., min_length=1, description="Columns to analyze")
561
+ sketch_type: SketchType = Field(..., description="Sketch type")
562
+ sketch_config: SketchConfig | None = Field(
563
+ None,
564
+ description="Sketch configuration",
565
+ )
566
+
567
+
568
+ class SketchEstimateResult(BaseModel):
569
+ """Result from sketch-based estimation."""
570
+
571
+ model_config = ConfigDict(extra="forbid")
572
+
573
+ column: str = Field(..., description="Column name")
574
+ sketch_type: SketchType = Field(..., description="Sketch type used")
575
+
576
+ # HyperLogLog results
577
+ cardinality_estimate: int | None = Field(
578
+ None,
579
+ description="Estimated distinct count",
580
+ )
581
+ cardinality_error: float | None = Field(
582
+ None,
583
+ description="Standard error of cardinality estimate",
584
+ )
585
+
586
+ # Count-Min Sketch results
587
+ heavy_hitters: list[dict[str, Any]] | None = Field(
588
+ None,
589
+ description="Frequent items with estimated counts",
590
+ )
591
+
592
+ # Bloom Filter results
593
+ membership_tests: dict[str, bool] | None = Field(
594
+ None,
595
+ description="Membership test results",
596
+ )
597
+
598
+ # Common metrics
599
+ memory_used_bytes: int = Field(..., description="Memory used by sketch")
600
+ processing_time_ms: float = Field(..., description="Processing time in ms")
601
+
602
+
603
+ class SketchEstimateResponse(BaseModel):
604
+ """Response with sketch-based estimates."""
605
+
606
+ model_config = ConfigDict(extra="forbid")
607
+
608
+ source_id: str = Field(..., description="Source ID")
609
+ results: list[SketchEstimateResult] = Field(..., description="Results per column")
610
+ total_time_ms: float = Field(..., description="Total processing time")
611
+ total_memory_mb: float = Field(..., description="Total memory used")
612
+
613
+
614
+ # ============================================================================
615
+ # Job Management Models
616
+ # ============================================================================
617
+
618
+
619
+ class SamplingJobStatus(BaseModel):
620
+ """Sampling job status for monitoring."""
621
+
622
+ model_config = ConfigDict(extra="forbid")
623
+
624
+ job_id: str = Field(..., description="Job ID")
625
+ source_id: str = Field(..., description="Source ID")
626
+ status: str = Field(..., description="Job status")
627
+ progress: float = Field(
628
+ default=0.0,
629
+ ge=0.0,
630
+ le=1.0,
631
+ description="Progress (0.0-1.0)",
632
+ )
633
+ current_stage: str | None = Field(None, description="Current processing stage")
634
+ started_at: datetime = Field(..., description="Start time")
635
+ estimated_completion: datetime | None = Field(
636
+ None,
637
+ description="Estimated completion time",
638
+ )
639
+
640
+ # Progress details
641
+ rows_processed: int = Field(default=0, description="Rows processed so far")
642
+ blocks_completed: int | None = Field(None, description="Blocks completed")
643
+ blocks_total: int | None = Field(None, description="Total blocks")
644
+
645
+
646
+ class SamplingJobListResponse(BaseModel):
647
+ """Response listing sampling jobs."""
648
+
649
+ model_config = ConfigDict(extra="forbid")
650
+
651
+ jobs: list[SamplingJobStatus] = Field(..., description="List of jobs")
652
+ total: int = Field(..., description="Total job count")
653
+ active_count: int = Field(..., description="Active job count")
@@ -246,39 +246,6 @@ class ImpactAnalysisResponse(BaseSchema):
246
246
  total_affected: int = Field(default=0, description="Total affected nodes")
247
247
 
248
248
 
249
- # =============================================================================
250
- # Auto-Discovery Schemas
251
- # =============================================================================
252
-
253
-
254
- class AutoDiscoverRequest(BaseSchema):
255
- """Request to auto-discover lineage from a source."""
256
-
257
- source_id: str = Field(..., description="Source ID to discover from")
258
- include_fk_relations: bool = Field(
259
- default=True,
260
- description="Include foreign key relationships (for DB sources)",
261
- )
262
- max_depth: int = Field(
263
- default=3,
264
- ge=1,
265
- le=10,
266
- description="Maximum depth for discovery",
267
- )
268
-
269
-
270
- class AutoDiscoverResponse(BaseSchema):
271
- """Response from auto-discovery."""
272
-
273
- source_id: str = Field(..., description="Source ID that was analyzed")
274
- discovered_nodes: int = Field(default=0, description="Number of nodes discovered")
275
- discovered_edges: int = Field(default=0, description="Number of edges discovered")
276
- graph: LineageGraphResponse = Field(
277
- ...,
278
- description="Discovered lineage graph",
279
- )
280
-
281
-
282
249
  # =============================================================================
283
250
  # Position Update Schemas
284
251
  # =============================================================================