truthound-dashboard 1.4.3__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. truthound_dashboard/api/alerts.py +75 -86
  2. truthound_dashboard/api/anomaly.py +7 -13
  3. truthound_dashboard/api/cross_alerts.py +38 -52
  4. truthound_dashboard/api/drift.py +49 -59
  5. truthound_dashboard/api/drift_monitor.py +234 -79
  6. truthound_dashboard/api/enterprise_sampling.py +498 -0
  7. truthound_dashboard/api/history.py +57 -5
  8. truthound_dashboard/api/lineage.py +3 -48
  9. truthound_dashboard/api/maintenance.py +104 -49
  10. truthound_dashboard/api/mask.py +1 -2
  11. truthound_dashboard/api/middleware.py +2 -1
  12. truthound_dashboard/api/model_monitoring.py +435 -311
  13. truthound_dashboard/api/notifications.py +227 -191
  14. truthound_dashboard/api/notifications_advanced.py +21 -20
  15. truthound_dashboard/api/observability.py +586 -0
  16. truthound_dashboard/api/plugins.py +2 -433
  17. truthound_dashboard/api/profile.py +199 -37
  18. truthound_dashboard/api/quality_reporter.py +701 -0
  19. truthound_dashboard/api/reports.py +7 -16
  20. truthound_dashboard/api/router.py +66 -0
  21. truthound_dashboard/api/rule_suggestions.py +5 -5
  22. truthound_dashboard/api/scan.py +17 -19
  23. truthound_dashboard/api/schedules.py +85 -50
  24. truthound_dashboard/api/schema_evolution.py +6 -6
  25. truthound_dashboard/api/schema_watcher.py +667 -0
  26. truthound_dashboard/api/sources.py +98 -27
  27. truthound_dashboard/api/tiering.py +1323 -0
  28. truthound_dashboard/api/triggers.py +14 -11
  29. truthound_dashboard/api/validations.py +12 -11
  30. truthound_dashboard/api/versioning.py +1 -6
  31. truthound_dashboard/core/__init__.py +129 -3
  32. truthound_dashboard/core/actions/__init__.py +62 -0
  33. truthound_dashboard/core/actions/custom.py +426 -0
  34. truthound_dashboard/core/actions/notifications.py +910 -0
  35. truthound_dashboard/core/actions/storage.py +472 -0
  36. truthound_dashboard/core/actions/webhook.py +281 -0
  37. truthound_dashboard/core/anomaly.py +262 -67
  38. truthound_dashboard/core/anomaly_explainer.py +4 -3
  39. truthound_dashboard/core/backends/__init__.py +67 -0
  40. truthound_dashboard/core/backends/base.py +299 -0
  41. truthound_dashboard/core/backends/errors.py +191 -0
  42. truthound_dashboard/core/backends/factory.py +423 -0
  43. truthound_dashboard/core/backends/mock_backend.py +451 -0
  44. truthound_dashboard/core/backends/truthound_backend.py +718 -0
  45. truthound_dashboard/core/checkpoint/__init__.py +87 -0
  46. truthound_dashboard/core/checkpoint/adapters.py +814 -0
  47. truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
  48. truthound_dashboard/core/checkpoint/runner.py +270 -0
  49. truthound_dashboard/core/connections.py +437 -10
  50. truthound_dashboard/core/converters/__init__.py +14 -0
  51. truthound_dashboard/core/converters/truthound.py +620 -0
  52. truthound_dashboard/core/cross_alerts.py +540 -320
  53. truthound_dashboard/core/datasource_factory.py +1672 -0
  54. truthound_dashboard/core/drift_monitor.py +216 -20
  55. truthound_dashboard/core/enterprise_sampling.py +1291 -0
  56. truthound_dashboard/core/interfaces/__init__.py +225 -0
  57. truthound_dashboard/core/interfaces/actions.py +652 -0
  58. truthound_dashboard/core/interfaces/base.py +247 -0
  59. truthound_dashboard/core/interfaces/checkpoint.py +676 -0
  60. truthound_dashboard/core/interfaces/protocols.py +664 -0
  61. truthound_dashboard/core/interfaces/reporters.py +650 -0
  62. truthound_dashboard/core/interfaces/routing.py +646 -0
  63. truthound_dashboard/core/interfaces/triggers.py +619 -0
  64. truthound_dashboard/core/lineage.py +407 -71
  65. truthound_dashboard/core/model_monitoring.py +431 -3
  66. truthound_dashboard/core/notifications/base.py +4 -0
  67. truthound_dashboard/core/notifications/channels.py +501 -1203
  68. truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
  69. truthound_dashboard/core/notifications/deduplication/service.py +131 -348
  70. truthound_dashboard/core/notifications/dispatcher.py +202 -11
  71. truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
  72. truthound_dashboard/core/notifications/escalation/engine.py +168 -358
  73. truthound_dashboard/core/notifications/routing/__init__.py +88 -128
  74. truthound_dashboard/core/notifications/routing/engine.py +90 -317
  75. truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
  76. truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
  77. truthound_dashboard/core/notifications/throttling/builder.py +117 -255
  78. truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
  79. truthound_dashboard/core/phase5/collaboration.py +1 -1
  80. truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
  81. truthound_dashboard/core/quality_reporter.py +1359 -0
  82. truthound_dashboard/core/report_history.py +0 -6
  83. truthound_dashboard/core/reporters/__init__.py +175 -14
  84. truthound_dashboard/core/reporters/adapters.py +943 -0
  85. truthound_dashboard/core/reporters/base.py +0 -3
  86. truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
  87. truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
  88. truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
  89. truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
  90. truthound_dashboard/core/reporters/compat.py +266 -0
  91. truthound_dashboard/core/reporters/csv_reporter.py +2 -35
  92. truthound_dashboard/core/reporters/factory.py +526 -0
  93. truthound_dashboard/core/reporters/interfaces.py +745 -0
  94. truthound_dashboard/core/reporters/registry.py +1 -10
  95. truthound_dashboard/core/scheduler.py +165 -0
  96. truthound_dashboard/core/schema_evolution.py +3 -3
  97. truthound_dashboard/core/schema_watcher.py +1528 -0
  98. truthound_dashboard/core/services.py +595 -76
  99. truthound_dashboard/core/store_manager.py +810 -0
  100. truthound_dashboard/core/streaming_anomaly.py +169 -4
  101. truthound_dashboard/core/tiering.py +1309 -0
  102. truthound_dashboard/core/triggers/evaluators.py +178 -8
  103. truthound_dashboard/core/truthound_adapter.py +2620 -197
  104. truthound_dashboard/core/unified_alerts.py +23 -20
  105. truthound_dashboard/db/__init__.py +8 -0
  106. truthound_dashboard/db/database.py +8 -2
  107. truthound_dashboard/db/models.py +944 -25
  108. truthound_dashboard/db/repository.py +2 -0
  109. truthound_dashboard/main.py +11 -0
  110. truthound_dashboard/schemas/__init__.py +177 -16
  111. truthound_dashboard/schemas/base.py +44 -23
  112. truthound_dashboard/schemas/collaboration.py +19 -6
  113. truthound_dashboard/schemas/cross_alerts.py +19 -3
  114. truthound_dashboard/schemas/drift.py +61 -55
  115. truthound_dashboard/schemas/drift_monitor.py +67 -23
  116. truthound_dashboard/schemas/enterprise_sampling.py +653 -0
  117. truthound_dashboard/schemas/lineage.py +0 -33
  118. truthound_dashboard/schemas/mask.py +10 -8
  119. truthound_dashboard/schemas/model_monitoring.py +89 -10
  120. truthound_dashboard/schemas/notifications_advanced.py +13 -0
  121. truthound_dashboard/schemas/observability.py +453 -0
  122. truthound_dashboard/schemas/plugins.py +0 -280
  123. truthound_dashboard/schemas/profile.py +154 -247
  124. truthound_dashboard/schemas/quality_reporter.py +403 -0
  125. truthound_dashboard/schemas/reports.py +2 -2
  126. truthound_dashboard/schemas/rule_suggestion.py +8 -1
  127. truthound_dashboard/schemas/scan.py +4 -24
  128. truthound_dashboard/schemas/schedule.py +11 -3
  129. truthound_dashboard/schemas/schema_watcher.py +727 -0
  130. truthound_dashboard/schemas/source.py +17 -2
  131. truthound_dashboard/schemas/tiering.py +822 -0
  132. truthound_dashboard/schemas/triggers.py +16 -0
  133. truthound_dashboard/schemas/unified_alerts.py +7 -0
  134. truthound_dashboard/schemas/validation.py +0 -13
  135. truthound_dashboard/schemas/validators/base.py +41 -21
  136. truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
  137. truthound_dashboard/schemas/validators/localization_validators.py +273 -0
  138. truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
  139. truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
  140. truthound_dashboard/schemas/validators/referential_validators.py +312 -0
  141. truthound_dashboard/schemas/validators/registry.py +93 -8
  142. truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
  143. truthound_dashboard/schemas/versioning.py +1 -6
  144. truthound_dashboard/static/index.html +2 -2
  145. truthound_dashboard-1.5.0.dist-info/METADATA +309 -0
  146. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/RECORD +149 -148
  147. truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
  148. truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
  149. truthound_dashboard/core/plugins/hooks/manager.py +0 -403
  150. truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
  151. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
  152. truthound_dashboard/core/reporters/junit_reporter.py +0 -233
  153. truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
  154. truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
  155. truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
  156. truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
  157. truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
  158. truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
  159. truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
  160. truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
  161. truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
  162. truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
  163. truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
  164. truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
  165. truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
  166. truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
  167. truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
  168. truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
  169. truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
  170. truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
  171. truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
  172. truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
  173. truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
  174. truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
  175. truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
  176. truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
  177. truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
  178. truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
  179. truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
  180. truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
  181. truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
  182. truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
  183. truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
  184. truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
  185. truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
  186. truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
  187. truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
  188. truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
  189. truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
  190. truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
  191. truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
  192. truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
  193. truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
  194. truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
  195. truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
  196. truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
  197. truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
  198. truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
  199. truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
  200. truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
  201. truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
  202. truthound_dashboard-1.4.3.dist-info/METADATA +0 -505
  203. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/WHEEL +0 -0
  204. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/entry_points.txt +0 -0
  205. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/licenses/LICENSE +0 -0
truthound_dashboard/core/enterprise_sampling.py
@@ -0,0 +1,1291 @@
+ """Enterprise-scale sampling strategies for large datasets.
+
+ This module provides the core business logic for truthound 1.2.10's enterprise
+ sampling capabilities, supporting datasets from 100M to billions of rows.
+
+ Architecture:
+     - Strategy Pattern: Each sampling method is a separate strategy class
+     - Factory Pattern: SamplingStrategyFactory creates the appropriate sampler based on scale
+     - Template Method: Base class defines the sampling workflow, strategies implement specifics
+
+ Strategies:
+     1. BlockSamplingStrategy: Divides data into blocks, samples proportionally
+     2. MultiStageSamplingStrategy: Hierarchical sampling in multiple passes
+     3. ColumnAwareSamplingStrategy: Adjusts sampling based on column types
+     4. ProgressiveSamplingStrategy: Iterative sampling until convergence
+     5. EnterpriseScaleSampler: Orchestrator that auto-selects the best strategy
+
+ Example:
+     from truthound_dashboard.core.enterprise_sampling import (
+         EnterpriseScaleSampler,
+         classify_dataset_scale,
+     )
+
+     sampler = EnterpriseScaleSampler()
+     result = await sampler.sample(config, data, row_count, column_count)
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import logging
+ import math
+ import time
+ import uuid
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any
+
+ from truthound_dashboard.schemas.enterprise_sampling import (
+     BlockSamplingConfig,
+     ColumnAwareSamplingConfig,
+     EnterpriseSamplingRequest,
+     EnterpriseSamplingResponse,
+     EnterpriseSamplingStrategy,
+     MemoryBudgetConfig,
+     MultiStageSamplingConfig,
+     ParallelSamplingConfig,
+     ProgressiveSamplingConfig,
+     SampleSizeEstimateRequest,
+     SampleSizeEstimateResponse,
+     SamplingJobStatus,
+     SamplingMetrics,
+     SamplingQuality,
+     ScaleCategory,
+     SchedulingPolicy,
+     SketchConfig,
+     SketchEstimateRequest,
+     SketchEstimateResponse,
+     SketchEstimateResult,
+     SketchType,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ # ============================================================================
+ # Constants
+ # ============================================================================
+
+ # Scale category thresholds
+ SCALE_THRESHOLDS = {
+     ScaleCategory.SMALL: 1_000_000,
+     ScaleCategory.MEDIUM: 10_000_000,
+     ScaleCategory.LARGE: 100_000_000,
+     ScaleCategory.XLARGE: 1_000_000_000,
+     # XXLARGE: > 1B
+ }
+
+ # Quality preset configurations
+ QUALITY_PRESETS = {
+     SamplingQuality.SKETCH: {
+         "target_rows": 10_000,
+         "confidence_level": 0.80,
+         "margin_of_error": 0.10,
+     },
+     SamplingQuality.QUICK: {
+         "target_rows": 50_000,
+         "confidence_level": 0.90,
+         "margin_of_error": 0.05,
+     },
+     SamplingQuality.STANDARD: {
+         "target_rows": 100_000,
+         "confidence_level": 0.95,
+         "margin_of_error": 0.05,
+     },
+     SamplingQuality.HIGH: {
+         "target_rows": 500_000,
+         "confidence_level": 0.99,
+         "margin_of_error": 0.03,
+     },
+     SamplingQuality.EXACT: {
+         "target_rows": None,  # Full scan
+         "confidence_level": 1.0,
+         "margin_of_error": 0.0,
+     },
+ }
+
+ # Strategy recommendations by scale
+ SCALE_STRATEGY_MAP = {
+     ScaleCategory.SMALL: EnterpriseSamplingStrategy.NONE,
+     ScaleCategory.MEDIUM: EnterpriseSamplingStrategy.COLUMN_AWARE,
+     ScaleCategory.LARGE: EnterpriseSamplingStrategy.BLOCK,
+     ScaleCategory.XLARGE: EnterpriseSamplingStrategy.MULTI_STAGE,
+     ScaleCategory.XXLARGE: EnterpriseSamplingStrategy.MULTI_STAGE,
+ }
+
+
+ # ============================================================================
+ # Utility Functions
+ # ============================================================================
+
+
+ def classify_dataset_scale(row_count: int) -> ScaleCategory:
+     """Classify dataset by scale category.
+
+     Args:
+         row_count: Number of rows in dataset.
+
+     Returns:
+         ScaleCategory enum value.
+     """
+     if row_count < SCALE_THRESHOLDS[ScaleCategory.SMALL]:
+         return ScaleCategory.SMALL
+     elif row_count < SCALE_THRESHOLDS[ScaleCategory.MEDIUM]:
+         return ScaleCategory.MEDIUM
+     elif row_count < SCALE_THRESHOLDS[ScaleCategory.LARGE]:
+         return ScaleCategory.LARGE
+     elif row_count < SCALE_THRESHOLDS[ScaleCategory.XLARGE]:
+         return ScaleCategory.XLARGE
+     else:
+         return ScaleCategory.XXLARGE
+
+
+ def calculate_cochran_sample_size(
+     population_size: int,
+     confidence_level: float = 0.95,
+     margin_of_error: float = 0.05,
+     p: float = 0.5,
+ ) -> int:
+     """Calculate optimal sample size using Cochran's formula.
+
+     Args:
+         population_size: Total population size (N).
+         confidence_level: Desired confidence level (e.g., 0.95).
+         margin_of_error: Acceptable margin of error (e.g., 0.05).
+         p: Expected proportion (0.5 for maximum variability).
+
+     Returns:
+         Recommended sample size.
+     """
+     # Z-scores for common confidence levels
+     z_scores = {
+         0.80: 1.28,
+         0.85: 1.44,
+         0.90: 1.645,
+         0.95: 1.96,
+         0.99: 2.576,
+     }
+
+     # Look up the z-score, defaulting to 1.96 (95%) for untabulated levels
+     z = z_scores.get(confidence_level, 1.96)
+
+     # Cochran's formula for infinite population
+     n0 = (z**2 * p * (1 - p)) / (margin_of_error**2)
+
+     # Finite population correction
+     n = n0 / (1 + (n0 - 1) / population_size)
+
+     return max(int(math.ceil(n)), 100)  # Minimum 100 samples
+
+
+ def estimate_processing_time(
+     row_count: int,
+     strategy: EnterpriseSamplingStrategy,
+     workers: int = 4,
+ ) -> float:
+     """Estimate processing time in seconds.
+
+     Args:
+         row_count: Number of rows to process.
+         strategy: Sampling strategy.
+         workers: Number of parallel workers.
+
+     Returns:
+         Estimated time in seconds.
+     """
+     # Base throughput estimates (rows/second per worker)
+     throughput_map = {
+         EnterpriseSamplingStrategy.NONE: 10_000_000,  # Full scan
+         EnterpriseSamplingStrategy.RANDOM: 5_000_000,
+         EnterpriseSamplingStrategy.BLOCK: 2_000_000,
+         EnterpriseSamplingStrategy.MULTI_STAGE: 1_000_000,
+         EnterpriseSamplingStrategy.COLUMN_AWARE: 3_000_000,
+         EnterpriseSamplingStrategy.PROGRESSIVE: 2_500_000,
+     }
+
+     base_throughput = throughput_map.get(strategy, 1_000_000)
+
+     # Parallel speedup (not perfectly linear)
+     parallel_efficiency = 0.7 if workers > 1 else 1.0
+     effective_throughput = base_throughput * workers * parallel_efficiency
+
+     return row_count / effective_throughput
+
+
+ def estimate_memory_usage(
+     row_count: int,
+     column_count: int,
+     strategy: EnterpriseSamplingStrategy,
+ ) -> float:
+     """Estimate memory usage in MB.
+
+     Args:
+         row_count: Number of rows.
+         column_count: Number of columns.
+         strategy: Sampling strategy.
+
+     Returns:
+         Estimated memory in MB.
+     """
+     # Base memory per row (rough estimate: 50 bytes per column)
+     bytes_per_row = column_count * 50
+
+     # Strategy-specific memory factors
+     memory_factors = {
+         EnterpriseSamplingStrategy.NONE: 1.0,
+         EnterpriseSamplingStrategy.RANDOM: 0.1,
+         EnterpriseSamplingStrategy.BLOCK: 0.2,  # Block buffer
+         EnterpriseSamplingStrategy.MULTI_STAGE: 0.15,
+         EnterpriseSamplingStrategy.COLUMN_AWARE: 0.12,
+         EnterpriseSamplingStrategy.PROGRESSIVE: 0.1,
+     }
+
+     factor = memory_factors.get(strategy, 0.1)
+     memory_bytes = row_count * bytes_per_row * factor
+
+     # Add overhead
+     memory_bytes *= 1.2
+
+     return memory_bytes / (1024 * 1024)
+
+
+ # ============================================================================
+ # Sampling Result Data Classes
+ # ============================================================================
+
+
+ @dataclass
+ class SamplingContext:
+     """Context passed through the sampling pipeline."""
+
+     source_id: str
+     job_id: str
+     config: EnterpriseSamplingRequest
+     row_count: int
+     column_count: int
+     scale_category: ScaleCategory
+     start_time: float = field(default_factory=time.time)
+
+     # Runtime state
+     rows_processed: int = 0
+     blocks_completed: int = 0
+     blocks_total: int = 0
+     current_stage: str = "initializing"
+
+     # Memory tracking
+     peak_memory_mb: float = 0.0
+     backpressure_events: int = 0
+
+     def elapsed_ms(self) -> float:
+         """Get elapsed time in milliseconds."""
+         return (time.time() - self.start_time) * 1000
+
+
+ @dataclass
+ class SamplingOutput:
+     """Output from sampling operation."""
+
+     sampled_data: Any  # Polars DataFrame or LazyFrame
+     sampled_rows: int
+     output_path: str | None = None
+
+     # Strategy-specific metadata
+     blocks_processed: int | None = None
+     stages_completed: int | None = None
+     converged_early: bool | None = None
+
+
+ # ============================================================================
+ # Abstract Base Strategy
+ # ============================================================================
+
+
+ class BaseSamplingStrategy(ABC):
+     """Abstract base class for sampling strategies.
+
+     Implements Template Method pattern - subclasses implement
+     `_do_sample()` while base class handles common logic.
+     """
+
+     @property
+     @abstractmethod
+     def strategy_type(self) -> EnterpriseSamplingStrategy:
+         """Get strategy type identifier."""
+         ...
+
+     @property
+     def supports_parallel(self) -> bool:
+         """Whether strategy supports parallel execution."""
+         return False
+
+     @property
+     def supports_streaming(self) -> bool:
+         """Whether strategy supports streaming."""
+         return False
+
+     async def sample(
+         self,
+         context: SamplingContext,
+         data: Any,
+     ) -> SamplingOutput:
+         """Execute sampling with common pre/post processing.
+
+         Args:
+             context: Sampling context with configuration.
+             data: Input data (Polars LazyFrame).
+
+         Returns:
+             SamplingOutput with sampled data.
+         """
+         context.current_stage = f"{self.strategy_type.value}_sampling"
+
+         try:
+             # Pre-sampling validation
+             self._validate_input(context, data)
+
+             # Execute strategy-specific sampling
+             output = await self._do_sample(context, data)
+
+             # Post-processing
+             output = self._post_process(context, output)
+
+             return output
+
+         except Exception as e:
+             logger.error(f"Sampling failed: {e}")
+             raise
+
+     def _validate_input(self, context: SamplingContext, data: Any) -> None:
+         """Validate input data before sampling."""
+         if data is None:
+             raise ValueError("Input data cannot be None")
+
+     @abstractmethod
+     async def _do_sample(
+         self,
+         context: SamplingContext,
+         data: Any,
+     ) -> SamplingOutput:
+         """Strategy-specific sampling implementation.
+
+         Args:
+             context: Sampling context.
+             data: Input data.
+
+         Returns:
+             SamplingOutput with results.
+         """
+         ...
+
+     def _post_process(
+         self,
+         context: SamplingContext,
+         output: SamplingOutput,
+     ) -> SamplingOutput:
+         """Post-process sampling output."""
+         return output
+
+
+ # ============================================================================
+ # Concrete Strategies
+ # ============================================================================
+
+
+ class NoSamplingStrategy(BaseSamplingStrategy):
+     """No sampling - use full dataset."""
+
+     @property
+     def strategy_type(self) -> EnterpriseSamplingStrategy:
+         return EnterpriseSamplingStrategy.NONE
+
+     async def _do_sample(
+         self,
+         context: SamplingContext,
+         data: Any,
+     ) -> SamplingOutput:
+         """Return data as-is."""
+         return SamplingOutput(
+             sampled_data=data,
+             sampled_rows=context.row_count,
+         )
+
+
+ class BlockSamplingStrategy(BaseSamplingStrategy):
+     """Block-based sampling for 10M-100M row datasets.
+
+     Divides data into fixed-size blocks and samples proportionally
+     from each block. Ensures even coverage across the dataset.
+     """
+
+     @property
+     def strategy_type(self) -> EnterpriseSamplingStrategy:
+         return EnterpriseSamplingStrategy.BLOCK
+
+     @property
+     def supports_parallel(self) -> bool:
+         return True
+
+     def __init__(self, config: BlockSamplingConfig | None = None):
+         self.config = config or BlockSamplingConfig()
+
+     async def _do_sample(
+         self,
+         context: SamplingContext,
+         data: Any,
+     ) -> SamplingOutput:
+         """Perform block-based sampling."""
+         import polars as pl
+
+         target_rows = context.config.target_rows
+
+         # Calculate block size
+         block_size = self.config.block_size
+         if block_size == 0:
+             # Auto-detect: aim for ~100 blocks
+             block_size = max(context.row_count // 100, 10_000)
+
+         num_blocks = math.ceil(context.row_count / block_size)
+         context.blocks_total = num_blocks
+
+         # Calculate samples per block
+         samples_per_block = self.config.sample_per_block
+         if samples_per_block is None:
+             samples_per_block = max(target_rows // num_blocks, 1)
+
+         logger.info(
+             f"Block sampling: {num_blocks} blocks, "
+             f"{samples_per_block} samples/block"
+         )
+
+         # Collect data and sample from each block
+         # In production, this would use truthound's block sampler
+         df = data.collect() if hasattr(data, "collect") else data
+         seed = context.config.seed or 42
+
+         sampled_dfs = []
+         for i in range(num_blocks):
+             start_idx = i * block_size
+             end_idx = min((i + 1) * block_size, len(df))
+             block = df.slice(start_idx, end_idx - start_idx)
+
+             if len(block) > samples_per_block:
+                 block = block.sample(n=samples_per_block, seed=seed + i)
+
+             sampled_dfs.append(block)
+             context.blocks_completed = i + 1
+
+         # Combine sampled blocks
+         sampled = pl.concat(sampled_dfs)
+
+         # Trim to target if oversampled
+         if len(sampled) > target_rows:
+             sampled = sampled.sample(n=target_rows, seed=seed)
+
+         return SamplingOutput(
+             sampled_data=sampled.lazy(),
+             sampled_rows=len(sampled),
+             blocks_processed=num_blocks,
+         )
+
+
+ class MultiStageSamplingStrategy(BaseSamplingStrategy):
+     """Multi-stage hierarchical sampling for 100M-1B row datasets.
+
+     Progressively reduces data in multiple stages. Each stage
+     reduces by factor (total_rows / target)^(1/stages).
+     """
+
+     @property
+     def strategy_type(self) -> EnterpriseSamplingStrategy:
+         return EnterpriseSamplingStrategy.MULTI_STAGE
+
+     def __init__(self, config: MultiStageSamplingConfig | None = None):
+         self.config = config or MultiStageSamplingConfig()
+
+     async def _do_sample(
+         self,
+         context: SamplingContext,
+         data: Any,
+     ) -> SamplingOutput:
+         """Perform multi-stage sampling."""
+         import polars as pl
+
+         target_rows = context.config.target_rows
+         num_stages = self.config.num_stages
+         seed = context.config.seed or 42
+
+         # Calculate reduction factor per stage
+         if self.config.stage_reduction_factor:
+             reduction = self.config.stage_reduction_factor
+         else:
+             reduction = (context.row_count / target_rows) ** (1 / num_stages)
+
+         logger.info(
+             f"Multi-stage sampling: {num_stages} stages, "
+             f"{reduction:.2f}x reduction per stage"
+         )
+
+         # Collect initial data
+         current_data = data.collect() if hasattr(data, "collect") else data
+         current_rows = len(current_data)
+
+         stages_completed = 0
+         for stage in range(num_stages):
+             target_stage_rows = int(current_rows / reduction)
+             target_stage_rows = max(target_stage_rows, target_rows)
+
+             if target_stage_rows >= current_rows:
+                 break
+
+             current_data = current_data.sample(
+                 n=target_stage_rows,
+                 seed=seed + stage,
+             )
+             current_rows = len(current_data)
+             stages_completed = stage + 1
+
+             logger.debug(f"Stage {stage + 1}: {current_rows} rows")
+
+             # Early stopping check
+             if self.config.early_stop_enabled and current_rows <= target_rows:
+                 break
+
+         # Final trim to exact target
+         if current_rows > target_rows:
+             current_data = current_data.sample(n=target_rows, seed=seed)
+
+         return SamplingOutput(
+             sampled_data=current_data.lazy(),
+             sampled_rows=len(current_data),
+             stages_completed=stages_completed,
+             converged_early=stages_completed < num_stages,
+         )
+
+
+ class ColumnAwareSamplingStrategy(BaseSamplingStrategy):
+     """Column-aware adaptive sampling for mixed-type datasets.
+
+     Adjusts sample size based on column type complexity:
+     - Strings: 2x multiplier (high cardinality)
+     - Categoricals: 0.5x multiplier (low cardinality)
+     - Complex types: 3x multiplier (List/Struct)
+     - Numeric: 1x baseline
+     """
+
+     @property
+     def strategy_type(self) -> EnterpriseSamplingStrategy:
+         return EnterpriseSamplingStrategy.COLUMN_AWARE
+
+     def __init__(self, config: ColumnAwareSamplingConfig | None = None):
+         self.config = config or ColumnAwareSamplingConfig()
+
+     async def _do_sample(
+         self,
+         context: SamplingContext,
+         data: Any,
+     ) -> SamplingOutput:
+         """Perform column-aware sampling."""
+         import polars as pl
+
+         target_rows = context.config.target_rows
+         seed = context.config.seed or 42
+
+         # Collect schema info
+         if hasattr(data, "collect_schema"):
+             schema = data.collect_schema()
+         else:
+             schema = data.schema
+
+         # Calculate adjusted sample size based on column types
+         type_multipliers = []
+         for col_name, dtype in schema.items():
+             dtype_str = str(dtype).lower()
+
+             if "string" in dtype_str or "utf8" in dtype_str:
+                 type_multipliers.append(self.config.string_multiplier)
+             elif "categorical" in dtype_str or "enum" in dtype_str:
+                 type_multipliers.append(self.config.categorical_multiplier)
+             elif "list" in dtype_str or "struct" in dtype_str:
+                 type_multipliers.append(self.config.complex_multiplier)
+             else:
+                 type_multipliers.append(self.config.numeric_multiplier)
+
+         # Use average multiplier
+         avg_multiplier = sum(type_multipliers) / len(type_multipliers)
+         adjusted_target = int(target_rows * avg_multiplier)
+         adjusted_target = min(adjusted_target, context.row_count)
+
+         logger.info(
+             f"Column-aware sampling: {len(type_multipliers)} columns, "
+             f"avg multiplier {avg_multiplier:.2f}, "
+             f"adjusted target {adjusted_target}"
+         )
+
+         # Perform sampling
+         df = data.collect() if hasattr(data, "collect") else data
+
+         if len(df) > adjusted_target:
+             df = df.sample(n=adjusted_target, seed=seed)
+
+         return SamplingOutput(
+             sampled_data=df.lazy(),
+             sampled_rows=len(df),
+         )
+
+
+ class ProgressiveSamplingStrategy(BaseSamplingStrategy):
+     """Progressive sampling with convergence detection.
+
+     Iteratively increases sample size until estimates stabilize
+     within convergence threshold. Supports early stopping.
+     """
+
+     @property
+     def strategy_type(self) -> EnterpriseSamplingStrategy:
+         return EnterpriseSamplingStrategy.PROGRESSIVE
+
+     def __init__(self, config: ProgressiveSamplingConfig | None = None):
+         self.config = config or ProgressiveSamplingConfig()
+
+     async def _do_sample(
+         self,
+         context: SamplingContext,
+         data: Any,
+     ) -> SamplingOutput:
+         """Perform progressive sampling."""
+         import polars as pl
+
+         target_rows = context.config.target_rows
+         seed = context.config.seed or 42
+
+         # Collect data
+         df = data.collect() if hasattr(data, "collect") else data
+         total_rows = len(df)
+
+         # Initial sample size
+         current_size = int(total_rows * self.config.initial_sample_ratio)
+         current_size = max(current_size, 1000)
+
+         # Track estimates for convergence check
+         prev_estimates: dict[str, float] = {}
+         stages_completed = 0
+         converged = False
+
+         for stage in range(self.config.max_stages):
+             # Sample current size
+             sample = df.sample(n=min(current_size, total_rows), seed=seed + stage)
+             stages_completed = stage + 1
+
+             # Calculate summary statistics for convergence check
+             numeric_cols = sample.select(pl.selectors.numeric()).columns
+             if numeric_cols:
+                 estimates = {}
+                 for col in numeric_cols[:5]:  # Check first 5 numeric columns
+                     mean = sample[col].mean()
+                     if mean is not None:
+                         estimates[col] = float(mean)
+
+                 # Check convergence
+                 if prev_estimates:
+                     max_change = 0.0
+                     for col, val in estimates.items():
+                         if col in prev_estimates and prev_estimates[col] != 0:
+                             change = abs(val - prev_estimates[col]) / abs(prev_estimates[col])
+                             max_change = max(max_change, change)
+
+                     if max_change < self.config.convergence_threshold:
+                         converged = True
+                         logger.info(f"Converged at stage {stage + 1} with change {max_change:.4f}")
+                         break
+
+                 prev_estimates = estimates
+
+             # Check if reached target
+             if current_size >= target_rows:
+                 break
+
+             # Grow sample size
+             current_size = int(current_size * self.config.growth_factor)
+             current_size = min(current_size, target_rows)
+
+         # Final sample at target size
+         final_sample = df.sample(n=min(target_rows, total_rows), seed=seed)
+
+         return SamplingOutput(
+             sampled_data=final_sample.lazy(),
+             sampled_rows=len(final_sample),
+             stages_completed=stages_completed,
+             converged_early=converged,
+         )
+
+
+ # ============================================================================
+ # Strategy Factory
+ # ============================================================================
+
+
+ class SamplingStrategyFactory:
+     """Factory for creating sampling strategies."""
+
+     _strategies: dict[EnterpriseSamplingStrategy, type[BaseSamplingStrategy]] = {
+         EnterpriseSamplingStrategy.NONE: NoSamplingStrategy,
+         EnterpriseSamplingStrategy.BLOCK: BlockSamplingStrategy,
+         EnterpriseSamplingStrategy.MULTI_STAGE: MultiStageSamplingStrategy,
+         EnterpriseSamplingStrategy.COLUMN_AWARE: ColumnAwareSamplingStrategy,
+         EnterpriseSamplingStrategy.PROGRESSIVE: ProgressiveSamplingStrategy,
+     }
+
+     @classmethod
+     def create(
+         cls,
+         strategy: EnterpriseSamplingStrategy,
+         config: EnterpriseSamplingRequest,
+     ) -> BaseSamplingStrategy:
+         """Create a sampling strategy instance.
+
+         Args:
+             strategy: Strategy type to create.
+             config: Sampling configuration.
+
+         Returns:
+             Strategy instance.
+         """
+         strategy_class = cls._strategies.get(strategy)
+
+         if strategy_class is None:
+             # Fall back to adaptive selection
+             logger.warning(f"Strategy {strategy} not found, using column-aware")
+             strategy_class = ColumnAwareSamplingStrategy
+
+         # Pass strategy-specific config if available
+         if strategy == EnterpriseSamplingStrategy.BLOCK and config.block_config:
+             return BlockSamplingStrategy(config.block_config)
+         elif strategy == EnterpriseSamplingStrategy.MULTI_STAGE and config.multi_stage_config:
+             return MultiStageSamplingStrategy(config.multi_stage_config)
+         elif strategy == EnterpriseSamplingStrategy.COLUMN_AWARE and config.column_aware_config:
+             return ColumnAwareSamplingStrategy(config.column_aware_config)
+         elif strategy == EnterpriseSamplingStrategy.PROGRESSIVE and config.progressive_config:
+             return ProgressiveSamplingStrategy(config.progressive_config)
+
+         return strategy_class()
+
+     @classmethod
+     def register(
+         cls,
+         strategy_type: EnterpriseSamplingStrategy,
+         strategy_class: type[BaseSamplingStrategy],
+     ) -> None:
+         """Register a custom sampling strategy.
+
+         Args:
+             strategy_type: Strategy identifier.
+             strategy_class: Strategy class.
+         """
+         cls._strategies[strategy_type] = strategy_class
+
+
+ # ============================================================================
+ # Enterprise Scale Sampler (Orchestrator)
+ # ============================================================================
+
+
+ class EnterpriseScaleSampler:
+     """Main orchestrator for enterprise-scale sampling.
+
+     Auto-selects the best sampling strategy based on dataset scale
+     and executes sampling with full observability.
+
+     Example:
+         sampler = EnterpriseScaleSampler()
+         response = await sampler.sample(config, data, row_count, column_count)
+     """
+
+     def __init__(self) -> None:
+         self._active_jobs: dict[str, SamplingJobStatus] = {}
+
+     async def sample(
+         self,
+         config: EnterpriseSamplingRequest,
+         data: Any,
+         row_count: int,
+         column_count: int,
+     ) -> EnterpriseSamplingResponse:
+         """Execute enterprise-scale sampling.
+
+         Args:
+             config: Sampling configuration.
+             data: Input data (Polars LazyFrame).
+             row_count: Total row count.
+             column_count: Total column count.
+
+         Returns:
+             EnterpriseSamplingResponse with results.
+         """
+         job_id = str(uuid.uuid4())
+         started_at = datetime.utcnow()
+
+         # Classify scale
+         scale = classify_dataset_scale(row_count)
+
+         # Create context
+         context = SamplingContext(
+             source_id=config.source_id,
+             job_id=job_id,
+             config=config,
+             row_count=row_count,
+             column_count=column_count,
+             scale_category=scale,
+         )
+
+         # Track job
+         self._active_jobs[job_id] = SamplingJobStatus(
+             job_id=job_id,
+             source_id=config.source_id,
+             status="running",
+             progress=0.0,
+             current_stage="initializing",
+             started_at=started_at,
+         )
+
+         try:
+             # Select strategy
+             strategy_type = self._select_strategy(config, scale)
+
+             # Create strategy
+             strategy = SamplingStrategyFactory.create(strategy_type, config)
+
+             # Execute sampling
+             output = await strategy.sample(context, data)
+
+             # Build metrics
+             metrics = SamplingMetrics(
+                 original_rows=row_count,
+                 sampled_rows=output.sampled_rows,
+                 sampling_ratio=output.sampled_rows / row_count if row_count > 0 else 1.0,
+                 strategy_used=strategy.strategy_type,
+                 scale_category=scale,
+                 is_sampled=output.sampled_rows < row_count,
+                 sampling_time_ms=context.elapsed_ms(),
+                 throughput_rows_per_sec=row_count / (context.elapsed_ms() / 1000) if context.elapsed_ms() > 0 else 0,
+                 speedup_factor=row_count / output.sampled_rows if output.sampled_rows > 0 else 1.0,
+                 peak_memory_mb=context.peak_memory_mb,
+                 workers_used=config.block_config.parallel.max_workers if config.block_config else 1,
+                 blocks_processed=output.blocks_processed,
+                 stages_completed=output.stages_completed,
+                 converged_early=output.converged_early,
+                 backpressure_events=context.backpressure_events,
+             )
+
+             # Update job status
+             self._active_jobs[job_id].status = "completed"
+             self._active_jobs[job_id].progress = 1.0
+
+             return EnterpriseSamplingResponse(
+                 source_id=config.source_id,
+                 job_id=job_id,
+                 status="completed",
+                 started_at=started_at,
+                 completed_at=datetime.utcnow(),
+                 metrics=metrics,
+                 sampled_data_path=output.output_path,
+             )
+
+         except Exception as e:
+             logger.error(f"Sampling failed for job {job_id}: {e}")
+
+             self._active_jobs[job_id].status = "failed"
+
+             return EnterpriseSamplingResponse(
+                 source_id=config.source_id,
+                 job_id=job_id,
+                 status="failed",
+                 started_at=started_at,
+                 completed_at=datetime.utcnow(),
+                 error_message=str(e),
+             )
+
+     def _select_strategy(
+         self,
+         config: EnterpriseSamplingRequest,
+         scale: ScaleCategory,
+     ) -> EnterpriseSamplingStrategy:
+         """Select best sampling strategy.
+
+         Args:
+             config: Sampling configuration.
+             scale: Dataset scale category.
+
+         Returns:
+             Selected strategy type.
+         """
+         # If explicitly specified, use it
+         if config.strategy != EnterpriseSamplingStrategy.ADAPTIVE:
+             return config.strategy
+
+         # Auto-select based on scale
+         return SCALE_STRATEGY_MAP.get(scale, EnterpriseSamplingStrategy.COLUMN_AWARE)
+
+     def get_job_status(self, job_id: str) -> SamplingJobStatus | None:
+         """Get status of a sampling job.
+
+         Args:
+             job_id: Job identifier.
+
+         Returns:
+             Job status or None if not found.
+         """
+         return self._active_jobs.get(job_id)
+
+     def list_jobs(self) -> list[SamplingJobStatus]:
+         """List all sampling jobs.
+
+         Returns:
+             List of job statuses.
+         """
+         return list(self._active_jobs.values())
+
+
+ # ============================================================================
+ # Sample Size Estimator
+ # ============================================================================
+
+
+ class SampleSizeEstimator:
+     """Estimates optimal sample sizes and provides recommendations."""
+
+     def estimate(self, request: SampleSizeEstimateRequest) -> SampleSizeEstimateResponse:
+         """Estimate optimal sample size.
+
+         Args:
+             request: Estimation request.
+
+         Returns:
+             Estimation response with recommendations.
+         """
+         population_size = request.population_size
+         scale = classify_dataset_scale(population_size)
+
+         # Apply quality preset
+         preset = QUALITY_PRESETS.get(request.quality, QUALITY_PRESETS[SamplingQuality.STANDARD])
+
+         # Calculate sample size using Cochran's formula
+         recommended = calculate_cochran_sample_size(
+             population_size=population_size,
+             confidence_level=request.confidence_level,
+             margin_of_error=request.margin_of_error,
+         )
+
+         # Apply preset target if specified
+         if preset["target_rows"] is not None:
+             recommended = max(recommended, preset["target_rows"])
+
+         # Calculate bounds
+         min_size = max(recommended // 2, 100)
+         max_size = min(recommended * 10, population_size)
+
+         # Get recommended strategy
+         strategy = SCALE_STRATEGY_MAP.get(scale, EnterpriseSamplingStrategy.COLUMN_AWARE)
+
+         # Estimate time and memory
+         estimated_time = estimate_processing_time(population_size, strategy)
+         estimated_memory = estimate_memory_usage(population_size, 50, strategy)  # Assume 50 columns
+
+         # Calculate speedup
+         speedup = population_size / recommended if recommended > 0 else 1.0
+
+         # Build rationale
+         rationale = self._build_rationale(scale, strategy, population_size)
+
+         return SampleSizeEstimateResponse(
+             population_size=population_size,
+             scale_category=scale,
+             recommended_size=recommended,
+             min_size=min_size,
+             max_size=max_size,
+             estimated_time_seconds=estimated_time,
+             estimated_memory_mb=estimated_memory,
+             speedup_factor=speedup,
+             recommended_strategy=strategy,
+             strategy_rationale=rationale,
+         )
+
+     def _build_rationale(
+         self,
+         scale: ScaleCategory,
+         strategy: EnterpriseSamplingStrategy,
+         population_size: int,
+     ) -> str:
+         """Build rationale for strategy recommendation."""
+         rationales = {
+             ScaleCategory.SMALL: "Dataset is small enough for full scan without sampling.",
+             ScaleCategory.MEDIUM: "Column-aware sampling adapts to data types for optimal accuracy.",
+             ScaleCategory.LARGE: "Block sampling ensures even coverage across the dataset with parallel processing.",
+             ScaleCategory.XLARGE: "Multi-stage sampling efficiently reduces billion-row datasets through hierarchical processing.",
+             ScaleCategory.XXLARGE: "Multi-stage sampling with probabilistic sketches for extreme-scale datasets.",
+         }
+         return rationales.get(scale, "Adaptive sampling based on data characteristics.")
+
+
+ # ============================================================================
+ # Sketch Estimator (Probabilistic Data Structures)
+ # ============================================================================
+
+
+ class SketchEstimator:
+     """Estimates using truthound probabilistic data structures for 10B+ row datasets.
+
+     Uses truthound.profiler.sketches for O(1) memory aggregations:
+     - HyperLogLog: Cardinality estimation (±0.41% error at precision=14)
+     - CountMinSketch: Frequency estimation and heavy hitters detection
+     - BloomFilter: Membership testing with configurable false positive rate
+     """
+
+     async def estimate(self, request: SketchEstimateRequest, data: Any) -> SketchEstimateResponse:
+         """Run sketch-based estimation.
+
+         Args:
+             request: Sketch estimation request.
+             data: Input data.
+
+         Returns:
+             Sketch estimation response.
+         """
+         start_time = time.time()
+         results: list[SketchEstimateResult] = []
+         total_memory = 0
+
+         config = request.sketch_config or SketchConfig()
+
+         for column in request.columns:
+             col_start = time.time()
+
+             if config.sketch_type == SketchType.HYPERLOGLOG:
+                 result = await self._estimate_cardinality(column, data, config)
+             elif config.sketch_type == SketchType.COUNTMIN:
+                 result = await self._estimate_frequency(column, data, config)
+             else:
+                 result = await self._test_membership(column, data, config)
+
+             result.processing_time_ms = (time.time() - col_start) * 1000
+             results.append(result)
+             total_memory += result.memory_used_bytes
+
+         return SketchEstimateResponse(
+             source_id=request.source_id,
+             results=results,
+             total_time_ms=(time.time() - start_time) * 1000,
+             total_memory_mb=total_memory / (1024 * 1024),
+         )
+
+     async def _estimate_cardinality(
+         self,
+         column: str,
+         data: Any,
+         config: SketchConfig,
+     ) -> SketchEstimateResult:
+         """Estimate cardinality using truthound's HyperLogLog."""
+         df = data.collect() if hasattr(data, "collect") else data
+
+         try:
+             from truthound.profiler.sketches import HyperLogLog, HyperLogLogConfig
+
+             # Create HyperLogLog with specified precision
+             hll_config = HyperLogLogConfig(precision=config.hll_precision)
+             hll = HyperLogLog(hll_config)
+
+             # Add values in batches for efficiency
+             column_values = df[column].drop_nulls().to_list()
+             hll.add_batch(column_values)
+
+             # Get estimate and error
+             cardinality_estimate = hll.estimate()
+             cardinality_error = hll.standard_error()
+
+             # Calculate memory usage
+             memory_bytes = (2 ** config.hll_precision) * 6 // 8
+
+             return SketchEstimateResult(
+                 column=column,
+                 sketch_type=SketchType.HYPERLOGLOG,
+                 cardinality_estimate=cardinality_estimate,
+                 cardinality_error=cardinality_error,
+                 memory_used_bytes=memory_bytes,
+                 processing_time_ms=0.0,
+             )
+
+         except ImportError:
+             logger.warning("truthound.profiler.sketches not available, using fallback")
+             # Fallback to Polars n_unique
+             unique_count = df[column].n_unique()
+             error = 1.04 / math.sqrt(2 ** config.hll_precision)
+             memory_bytes = (2 ** config.hll_precision) * 6 // 8
+
+             return SketchEstimateResult(
+                 column=column,
+                 sketch_type=SketchType.HYPERLOGLOG,
+                 cardinality_estimate=unique_count,
+                 cardinality_error=error,
+                 memory_used_bytes=memory_bytes,
+                 processing_time_ms=0.0,
+             )
+
+     async def _estimate_frequency(
+         self,
+         column: str,
+         data: Any,
+         config: SketchConfig,
+     ) -> SketchEstimateResult:
+         """Estimate frequencies using truthound's Count-Min Sketch."""
+         import polars as pl
+
+         df = data.collect() if hasattr(data, "collect") else data
+
+         try:
+             from truthound.profiler.sketches import CountMinSketch, CountMinSketchConfig
+
+             # Create Count-Min Sketch with specified dimensions
+             cms_config = CountMinSketchConfig(
+                 width=config.cms_width,
+                 depth=config.cms_depth,
+             )
+             cms = CountMinSketch(cms_config)
+
+             # Add all values
+             column_values = df[column].drop_nulls().to_list()
+             for value in column_values:
+                 cms.add(value)
+
+             # Get heavy hitters (items appearing in >1% of stream)
+             heavy_hitters_raw = cms.get_heavy_hitters(threshold=0.01)
+             heavy_hitters = [
+                 {"value": str(item), "count": count}
+                 for item, count in heavy_hitters_raw[:10]
+             ]
+
+             # Memory = width * depth * 4 bytes (32-bit counters)
+             memory_bytes = config.cms_width * config.cms_depth * 4
+
+             return SketchEstimateResult(
+                 column=column,
+                 sketch_type=SketchType.COUNTMIN,
+                 heavy_hitters=heavy_hitters,
+                 memory_used_bytes=memory_bytes,
+                 processing_time_ms=0.0,
+             )
+
+         except ImportError:
+             logger.warning("truthound.profiler.sketches not available, using fallback")
+             # Fallback to Polars group_by
+             value_counts = (
+                 df.group_by(column)
+                 .agg(pl.len().alias("count"))
+                 .sort("count", descending=True)
+                 .head(10)
+             )
+
+             heavy_hitters = [
+                 {"value": str(row[column]), "count": row["count"]}
+                 for row in value_counts.iter_rows(named=True)
+             ]
+
+             memory_bytes = config.cms_width * config.cms_depth * 4
+
+             return SketchEstimateResult(
+                 column=column,
+                 sketch_type=SketchType.COUNTMIN,
+                 heavy_hitters=heavy_hitters,
+                 memory_used_bytes=memory_bytes,
+                 processing_time_ms=0.0,
+             )
+
+     async def _test_membership(
+         self,
+         column: str,
+         data: Any,
+         config: SketchConfig,
+     ) -> SketchEstimateResult:
+         """Test membership using truthound's Bloom Filter."""
+         df = data.collect() if hasattr(data, "collect") else data
+
+         try:
+             from truthound.profiler.sketches import BloomFilter, BloomFilterConfig
+
+             # Create Bloom Filter with specified capacity and error rate
+             bf_config = BloomFilterConfig(
+                 capacity=config.bloom_capacity,
+                 error_rate=config.bloom_error_rate,
+             )
+             bf = BloomFilter(bf_config)
+
+             # Add all values
+             column_values = df[column].drop_nulls().to_list()
+             for value in column_values:
+                 bf.add(value)
+
+             # Get current false positive rate
+             actual_fp_rate = bf.false_positive_rate()
+
+             # Calculate memory usage
+             m = -config.bloom_capacity * math.log(config.bloom_error_rate) / (math.log(2) ** 2)
+             memory_bytes = int(m / 8)
+
+             return SketchEstimateResult(
+                 column=column,
+                 sketch_type=SketchType.BLOOM,
+                 membership_tests={
+                     "items_added": len(column_values),
+                     "false_positive_rate": actual_fp_rate,
+                 },
+                 memory_used_bytes=memory_bytes,
+                 processing_time_ms=0.0,
+             )
+
+         except ImportError:
+             logger.warning("truthound.profiler.sketches not available, using fallback")
+             # Fallback: just calculate memory requirements
+             m = -config.bloom_capacity * math.log(config.bloom_error_rate) / (math.log(2) ** 2)
+             memory_bytes = int(m / 8)
+
+             return SketchEstimateResult(
+                 column=column,
+                 sketch_type=SketchType.BLOOM,
+                 membership_tests={},
+                 memory_used_bytes=memory_bytes,
+                 processing_time_ms=0.0,
+             )
+
+
+ # ============================================================================
+ # Singleton Instance
+ # ============================================================================
+
+ _sampler: EnterpriseScaleSampler | None = None
+ _estimator: SampleSizeEstimator | None = None
+ _sketch_estimator: SketchEstimator | None = None
+
+
+ def get_enterprise_sampler() -> EnterpriseScaleSampler:
+     """Get enterprise sampler singleton."""
+     global _sampler
+     if _sampler is None:
+         _sampler = EnterpriseScaleSampler()
+     return _sampler
+
+
+ def get_sample_size_estimator() -> SampleSizeEstimator:
+     """Get sample size estimator singleton."""
+     global _estimator
+     if _estimator is None:
+         _estimator = SampleSizeEstimator()
+     return _estimator
+
+
+ def get_sketch_estimator() -> SketchEstimator:
+     """Get sketch estimator singleton."""
+     global _sketch_estimator
+     if _sketch_estimator is None:
+         _sketch_estimator = SketchEstimator()
+     return _sketch_estimator
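
A minimal usage sketch of the new module, assembled only from the signatures visible in the diff above, may help when reviewing this release. The keyword arguments passed to EnterpriseSamplingRequest (source_id, strategy, target_rows, seed) are assumptions inferred from attribute accesses in the module and should be verified against truthound_dashboard/schemas/enterprise_sampling.py; the expected values in the comments follow directly from the constants and Cochran formula shown in the diff.

# Usage sketch only; EnterpriseSamplingRequest field names are assumptions,
# not a confirmed API. Everything else is imported as defined in the diff.
import asyncio

import polars as pl

from truthound_dashboard.core.enterprise_sampling import (
    calculate_cochran_sample_size,
    classify_dataset_scale,
    get_enterprise_sampler,
)
from truthound_dashboard.schemas.enterprise_sampling import (
    EnterpriseSamplingRequest,
    EnterpriseSamplingStrategy,
)


async def main() -> None:
    # 5M rows falls in the MEDIUM band (1M <= rows < 10M), which the ADAPTIVE
    # path maps to COLUMN_AWARE via SCALE_STRATEGY_MAP.
    lf = pl.LazyFrame({"user_id": range(5_000_000), "score": range(5_000_000)})
    print(classify_dataset_scale(5_000_000))  # ScaleCategory.MEDIUM

    # Cochran at the defaults (z=1.96, p=0.5, e=0.05): n0 = 384.16; the finite
    # population correction at N=5M leaves ~384.13, so ceil() yields 385.
    print(calculate_cochran_sample_size(5_000_000))  # 385

    config = EnterpriseSamplingRequest(  # field names assumed from usage above
        source_id="demo",
        strategy=EnterpriseSamplingStrategy.ADAPTIVE,
        target_rows=100_000,
        seed=7,
    )
    response = await get_enterprise_sampler().sample(
        config, lf, row_count=5_000_000, column_count=2
    )
    print(response.status)
    if response.metrics is not None:
        print(response.metrics.strategy_used, response.metrics.sampled_rows)


asyncio.run(main())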