truthound-dashboard 1.4.3__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +75 -86
- truthound_dashboard/api/anomaly.py +7 -13
- truthound_dashboard/api/cross_alerts.py +38 -52
- truthound_dashboard/api/drift.py +49 -59
- truthound_dashboard/api/drift_monitor.py +234 -79
- truthound_dashboard/api/enterprise_sampling.py +498 -0
- truthound_dashboard/api/history.py +57 -5
- truthound_dashboard/api/lineage.py +3 -48
- truthound_dashboard/api/maintenance.py +104 -49
- truthound_dashboard/api/mask.py +1 -2
- truthound_dashboard/api/middleware.py +2 -1
- truthound_dashboard/api/model_monitoring.py +435 -311
- truthound_dashboard/api/notifications.py +227 -191
- truthound_dashboard/api/notifications_advanced.py +21 -20
- truthound_dashboard/api/observability.py +586 -0
- truthound_dashboard/api/plugins.py +2 -433
- truthound_dashboard/api/profile.py +199 -37
- truthound_dashboard/api/quality_reporter.py +701 -0
- truthound_dashboard/api/reports.py +7 -16
- truthound_dashboard/api/router.py +66 -0
- truthound_dashboard/api/rule_suggestions.py +5 -5
- truthound_dashboard/api/scan.py +17 -19
- truthound_dashboard/api/schedules.py +85 -50
- truthound_dashboard/api/schema_evolution.py +6 -6
- truthound_dashboard/api/schema_watcher.py +667 -0
- truthound_dashboard/api/sources.py +98 -27
- truthound_dashboard/api/tiering.py +1323 -0
- truthound_dashboard/api/triggers.py +14 -11
- truthound_dashboard/api/validations.py +12 -11
- truthound_dashboard/api/versioning.py +1 -6
- truthound_dashboard/core/__init__.py +129 -3
- truthound_dashboard/core/actions/__init__.py +62 -0
- truthound_dashboard/core/actions/custom.py +426 -0
- truthound_dashboard/core/actions/notifications.py +910 -0
- truthound_dashboard/core/actions/storage.py +472 -0
- truthound_dashboard/core/actions/webhook.py +281 -0
- truthound_dashboard/core/anomaly.py +262 -67
- truthound_dashboard/core/anomaly_explainer.py +4 -3
- truthound_dashboard/core/backends/__init__.py +67 -0
- truthound_dashboard/core/backends/base.py +299 -0
- truthound_dashboard/core/backends/errors.py +191 -0
- truthound_dashboard/core/backends/factory.py +423 -0
- truthound_dashboard/core/backends/mock_backend.py +451 -0
- truthound_dashboard/core/backends/truthound_backend.py +718 -0
- truthound_dashboard/core/checkpoint/__init__.py +87 -0
- truthound_dashboard/core/checkpoint/adapters.py +814 -0
- truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
- truthound_dashboard/core/checkpoint/runner.py +270 -0
- truthound_dashboard/core/connections.py +437 -10
- truthound_dashboard/core/converters/__init__.py +14 -0
- truthound_dashboard/core/converters/truthound.py +620 -0
- truthound_dashboard/core/cross_alerts.py +540 -320
- truthound_dashboard/core/datasource_factory.py +1672 -0
- truthound_dashboard/core/drift_monitor.py +216 -20
- truthound_dashboard/core/enterprise_sampling.py +1291 -0
- truthound_dashboard/core/interfaces/__init__.py +225 -0
- truthound_dashboard/core/interfaces/actions.py +652 -0
- truthound_dashboard/core/interfaces/base.py +247 -0
- truthound_dashboard/core/interfaces/checkpoint.py +676 -0
- truthound_dashboard/core/interfaces/protocols.py +664 -0
- truthound_dashboard/core/interfaces/reporters.py +650 -0
- truthound_dashboard/core/interfaces/routing.py +646 -0
- truthound_dashboard/core/interfaces/triggers.py +619 -0
- truthound_dashboard/core/lineage.py +407 -71
- truthound_dashboard/core/model_monitoring.py +431 -3
- truthound_dashboard/core/notifications/base.py +4 -0
- truthound_dashboard/core/notifications/channels.py +501 -1203
- truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
- truthound_dashboard/core/notifications/deduplication/service.py +131 -348
- truthound_dashboard/core/notifications/dispatcher.py +202 -11
- truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
- truthound_dashboard/core/notifications/escalation/engine.py +168 -358
- truthound_dashboard/core/notifications/routing/__init__.py +88 -128
- truthound_dashboard/core/notifications/routing/engine.py +90 -317
- truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
- truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
- truthound_dashboard/core/notifications/throttling/builder.py +117 -255
- truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
- truthound_dashboard/core/phase5/collaboration.py +1 -1
- truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
- truthound_dashboard/core/quality_reporter.py +1359 -0
- truthound_dashboard/core/report_history.py +0 -6
- truthound_dashboard/core/reporters/__init__.py +175 -14
- truthound_dashboard/core/reporters/adapters.py +943 -0
- truthound_dashboard/core/reporters/base.py +0 -3
- truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
- truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
- truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
- truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
- truthound_dashboard/core/reporters/compat.py +266 -0
- truthound_dashboard/core/reporters/csv_reporter.py +2 -35
- truthound_dashboard/core/reporters/factory.py +526 -0
- truthound_dashboard/core/reporters/interfaces.py +745 -0
- truthound_dashboard/core/reporters/registry.py +1 -10
- truthound_dashboard/core/scheduler.py +165 -0
- truthound_dashboard/core/schema_evolution.py +3 -3
- truthound_dashboard/core/schema_watcher.py +1528 -0
- truthound_dashboard/core/services.py +595 -76
- truthound_dashboard/core/store_manager.py +810 -0
- truthound_dashboard/core/streaming_anomaly.py +169 -4
- truthound_dashboard/core/tiering.py +1309 -0
- truthound_dashboard/core/triggers/evaluators.py +178 -8
- truthound_dashboard/core/truthound_adapter.py +2620 -197
- truthound_dashboard/core/unified_alerts.py +23 -20
- truthound_dashboard/db/__init__.py +8 -0
- truthound_dashboard/db/database.py +8 -2
- truthound_dashboard/db/models.py +944 -25
- truthound_dashboard/db/repository.py +2 -0
- truthound_dashboard/main.py +11 -0
- truthound_dashboard/schemas/__init__.py +177 -16
- truthound_dashboard/schemas/base.py +44 -23
- truthound_dashboard/schemas/collaboration.py +19 -6
- truthound_dashboard/schemas/cross_alerts.py +19 -3
- truthound_dashboard/schemas/drift.py +61 -55
- truthound_dashboard/schemas/drift_monitor.py +67 -23
- truthound_dashboard/schemas/enterprise_sampling.py +653 -0
- truthound_dashboard/schemas/lineage.py +0 -33
- truthound_dashboard/schemas/mask.py +10 -8
- truthound_dashboard/schemas/model_monitoring.py +89 -10
- truthound_dashboard/schemas/notifications_advanced.py +13 -0
- truthound_dashboard/schemas/observability.py +453 -0
- truthound_dashboard/schemas/plugins.py +0 -280
- truthound_dashboard/schemas/profile.py +154 -247
- truthound_dashboard/schemas/quality_reporter.py +403 -0
- truthound_dashboard/schemas/reports.py +2 -2
- truthound_dashboard/schemas/rule_suggestion.py +8 -1
- truthound_dashboard/schemas/scan.py +4 -24
- truthound_dashboard/schemas/schedule.py +11 -3
- truthound_dashboard/schemas/schema_watcher.py +727 -0
- truthound_dashboard/schemas/source.py +17 -2
- truthound_dashboard/schemas/tiering.py +822 -0
- truthound_dashboard/schemas/triggers.py +16 -0
- truthound_dashboard/schemas/unified_alerts.py +7 -0
- truthound_dashboard/schemas/validation.py +0 -13
- truthound_dashboard/schemas/validators/base.py +41 -21
- truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
- truthound_dashboard/schemas/validators/localization_validators.py +273 -0
- truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
- truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
- truthound_dashboard/schemas/validators/referential_validators.py +312 -0
- truthound_dashboard/schemas/validators/registry.py +93 -8
- truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
- truthound_dashboard/schemas/versioning.py +1 -6
- truthound_dashboard/static/index.html +2 -2
- truthound_dashboard-1.5.0.dist-info/METADATA +309 -0
- {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/RECORD +149 -148
- truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
- truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
- truthound_dashboard/core/plugins/hooks/manager.py +0 -403
- truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
- truthound_dashboard/core/reporters/junit_reporter.py +0 -233
- truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
- truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
- truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
- truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
- truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
- truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
- truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
- truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
- truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
- truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
- truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
- truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
- truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
- truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
- truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
- truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
- truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
- truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
- truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
- truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
- truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
- truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
- truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
- truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
- truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
- truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
- truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
- truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
- truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
- truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
- truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
- truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
- truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
- truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
- truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
- truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
- truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
- truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
- truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
- truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
- truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
- truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
- truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
- truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
- truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
- truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
- truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
- truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
- truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
- truthound_dashboard-1.4.3.dist-info/METADATA +0 -505
- {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/licenses/LICENSE +0 -0

truthound_dashboard/core/enterprise_sampling.py (new file)
@@ -0,0 +1,1291 @@
+"""Enterprise-scale sampling strategies for large datasets.
+
+This module provides the core business logic for truthound 1.2.10's enterprise
+sampling capabilities, supporting datasets from 100M to billions of rows.
+
+Architecture:
+    - Strategy Pattern: Each sampling method is a separate strategy class
+    - Factory Pattern: SamplerFactory creates appropriate sampler based on scale
+    - Template Method: Base class defines sampling workflow, strategies implement specifics
+
+Strategies:
+    1. BlockSamplingStrategy: Divides data into blocks, samples proportionally
+    2. MultiStageSamplingStrategy: Hierarchical sampling in multiple passes
+    3. ColumnAwareSamplingStrategy: Adjusts sampling based on column types
+    4. ProgressiveSamplingStrategy: Iterative sampling until convergence
+    5. EnterpriseScaleSampler: Orchestrator that auto-selects best strategy
+
+Example:
+    from truthound_dashboard.core.enterprise_sampling import (
+        EnterpriseScaleSampler,
+        classify_dataset_scale,
+    )
+
+    sampler = EnterpriseScaleSampler()
+    result = await sampler.sample(source_id, config)
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import math
+import time
+import uuid
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+from truthound_dashboard.schemas.enterprise_sampling import (
+    BlockSamplingConfig,
+    ColumnAwareSamplingConfig,
+    EnterpriseSamplingRequest,
+    EnterpriseSamplingResponse,
+    EnterpriseSamplingStrategy,
+    MemoryBudgetConfig,
+    MultiStageSamplingConfig,
+    ParallelSamplingConfig,
+    ProgressiveSamplingConfig,
+    SampleSizeEstimateRequest,
+    SampleSizeEstimateResponse,
+    SamplingJobStatus,
+    SamplingMetrics,
+    SamplingQuality,
+    ScaleCategory,
+    SchedulingPolicy,
+    SketchConfig,
+    SketchEstimateRequest,
+    SketchEstimateResponse,
+    SketchEstimateResult,
+    SketchType,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ============================================================================
+# Constants
+# ============================================================================
+
+# Scale category thresholds
+SCALE_THRESHOLDS = {
+    ScaleCategory.SMALL: 1_000_000,
+    ScaleCategory.MEDIUM: 10_000_000,
+    ScaleCategory.LARGE: 100_000_000,
+    ScaleCategory.XLARGE: 1_000_000_000,
+    # XXLARGE: > 1B
+}
+
+# Quality preset configurations
+QUALITY_PRESETS = {
+    SamplingQuality.SKETCH: {
+        "target_rows": 10_000,
+        "confidence_level": 0.80,
+        "margin_of_error": 0.10,
+    },
+    SamplingQuality.QUICK: {
+        "target_rows": 50_000,
+        "confidence_level": 0.90,
+        "margin_of_error": 0.05,
+    },
+    SamplingQuality.STANDARD: {
+        "target_rows": 100_000,
+        "confidence_level": 0.95,
+        "margin_of_error": 0.05,
+    },
+    SamplingQuality.HIGH: {
+        "target_rows": 500_000,
+        "confidence_level": 0.99,
+        "margin_of_error": 0.03,
+    },
+    SamplingQuality.EXACT: {
+        "target_rows": None,  # Full scan
+        "confidence_level": 1.0,
+        "margin_of_error": 0.0,
+    },
+}
+
+# Strategy recommendations by scale
+SCALE_STRATEGY_MAP = {
+    ScaleCategory.SMALL: EnterpriseSamplingStrategy.NONE,
+    ScaleCategory.MEDIUM: EnterpriseSamplingStrategy.COLUMN_AWARE,
+    ScaleCategory.LARGE: EnterpriseSamplingStrategy.BLOCK,
+    ScaleCategory.XLARGE: EnterpriseSamplingStrategy.MULTI_STAGE,
+    ScaleCategory.XXLARGE: EnterpriseSamplingStrategy.MULTI_STAGE,
+}
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+
+def classify_dataset_scale(row_count: int) -> ScaleCategory:
+    """Classify dataset by scale category.
+
+    Args:
+        row_count: Number of rows in dataset.
+
+    Returns:
+        ScaleCategory enum value.
+    """
+    if row_count < SCALE_THRESHOLDS[ScaleCategory.SMALL]:
+        return ScaleCategory.SMALL
+    elif row_count < SCALE_THRESHOLDS[ScaleCategory.MEDIUM]:
+        return ScaleCategory.MEDIUM
+    elif row_count < SCALE_THRESHOLDS[ScaleCategory.LARGE]:
+        return ScaleCategory.LARGE
+    elif row_count < SCALE_THRESHOLDS[ScaleCategory.XLARGE]:
+        return ScaleCategory.XLARGE
+    else:
+        return ScaleCategory.XXLARGE
+
+
+def calculate_cochran_sample_size(
+    population_size: int,
+    confidence_level: float = 0.95,
+    margin_of_error: float = 0.05,
+    p: float = 0.5,
+) -> int:
+    """Calculate optimal sample size using Cochran's formula.
+
+    Args:
+        population_size: Total population size (N).
+        confidence_level: Desired confidence level (e.g., 0.95).
+        margin_of_error: Acceptable margin of error (e.g., 0.05).
+        p: Expected proportion (0.5 for maximum variability).
+
+    Returns:
+        Recommended sample size.
+    """
+    # Z-scores for common confidence levels
+    z_scores = {
+        0.80: 1.28,
+        0.85: 1.44,
+        0.90: 1.645,
+        0.95: 1.96,
+        0.99: 2.576,
+    }
+
+    # Get closest z-score
+    z = z_scores.get(confidence_level, 1.96)
+
+    # Cochran's formula for infinite population
+    n0 = (z**2 * p * (1 - p)) / (margin_of_error**2)
+
+    # Finite population correction
+    n = n0 / (1 + (n0 - 1) / population_size)
+
+    return max(int(math.ceil(n)), 100)  # Minimum 100 samples
+
+
+def estimate_processing_time(
+    row_count: int,
+    strategy: EnterpriseSamplingStrategy,
+    workers: int = 4,
+) -> float:
+    """Estimate processing time in seconds.
+
+    Args:
+        row_count: Number of rows to process.
+        strategy: Sampling strategy.
+        workers: Number of parallel workers.
+
+    Returns:
+        Estimated time in seconds.
+    """
+    # Base throughput estimates (rows/second per worker)
+    throughput_map = {
+        EnterpriseSamplingStrategy.NONE: 10_000_000,  # Full scan
+        EnterpriseSamplingStrategy.RANDOM: 5_000_000,
+        EnterpriseSamplingStrategy.BLOCK: 2_000_000,
+        EnterpriseSamplingStrategy.MULTI_STAGE: 1_000_000,
+        EnterpriseSamplingStrategy.COLUMN_AWARE: 3_000_000,
+        EnterpriseSamplingStrategy.PROGRESSIVE: 2_500_000,
+    }
+
+    base_throughput = throughput_map.get(strategy, 1_000_000)
+
+    # Parallel speedup (not perfectly linear)
+    parallel_efficiency = 0.7 if workers > 1 else 1.0
+    effective_throughput = base_throughput * workers * parallel_efficiency
+
+    return row_count / effective_throughput
+
+
+def estimate_memory_usage(
+    row_count: int,
+    column_count: int,
+    strategy: EnterpriseSamplingStrategy,
+) -> float:
+    """Estimate memory usage in MB.
+
+    Args:
+        row_count: Number of rows.
+        column_count: Number of columns.
+        strategy: Sampling strategy.
+
+    Returns:
+        Estimated memory in MB.
+    """
+    # Base memory per row (rough estimate: 50 bytes per column)
+    bytes_per_row = column_count * 50
+
+    # Strategy-specific memory factors
+    memory_factors = {
+        EnterpriseSamplingStrategy.NONE: 1.0,
+        EnterpriseSamplingStrategy.RANDOM: 0.1,
+        EnterpriseSamplingStrategy.BLOCK: 0.2,  # Block buffer
+        EnterpriseSamplingStrategy.MULTI_STAGE: 0.15,
+        EnterpriseSamplingStrategy.COLUMN_AWARE: 0.12,
+        EnterpriseSamplingStrategy.PROGRESSIVE: 0.1,
+    }
+
+    factor = memory_factors.get(strategy, 0.1)
+    memory_bytes = row_count * bytes_per_row * factor
+
+    # Add overhead
+    memory_bytes *= 1.2
+
+    return memory_bytes / (1024 * 1024)
+
+
+# ============================================================================
+# Sampling Result Data Classes
+# ============================================================================
+
+
+@dataclass
+class SamplingContext:
+    """Context passed through the sampling pipeline."""
+
+    source_id: str
+    job_id: str
+    config: EnterpriseSamplingRequest
+    row_count: int
+    column_count: int
+    scale_category: ScaleCategory
+    start_time: float = field(default_factory=time.time)
+
+    # Runtime state
+    rows_processed: int = 0
+    blocks_completed: int = 0
+    blocks_total: int = 0
+    current_stage: str = "initializing"
+
+    # Memory tracking
+    peak_memory_mb: float = 0.0
+    backpressure_events: int = 0
+
+    def elapsed_ms(self) -> float:
+        """Get elapsed time in milliseconds."""
+        return (time.time() - self.start_time) * 1000
+
+
+@dataclass
+class SamplingOutput:
+    """Output from sampling operation."""
+
+    sampled_data: Any  # Polars DataFrame or LazyFrame
+    sampled_rows: int
+    output_path: str | None = None
+
+    # Strategy-specific metadata
+    blocks_processed: int | None = None
+    stages_completed: int | None = None
+    converged_early: bool | None = None
+
+
+# ============================================================================
+# Abstract Base Strategy
+# ============================================================================
+
+
+class BaseSamplingStrategy(ABC):
+    """Abstract base class for sampling strategies.
+
+    Implements Template Method pattern - subclasses implement
+    `_do_sample()` while base class handles common logic.
+    """
+
+    @property
+    @abstractmethod
+    def strategy_type(self) -> EnterpriseSamplingStrategy:
+        """Get strategy type identifier."""
+        ...
+
+    @property
+    def supports_parallel(self) -> bool:
+        """Whether strategy supports parallel execution."""
+        return False
+
+    @property
+    def supports_streaming(self) -> bool:
+        """Whether strategy supports streaming."""
+        return False
+
+    async def sample(
+        self,
+        context: SamplingContext,
+        data: Any,
+    ) -> SamplingOutput:
+        """Execute sampling with common pre/post processing.
+
+        Args:
+            context: Sampling context with configuration.
+            data: Input data (Polars LazyFrame).
+
+        Returns:
+            SamplingOutput with sampled data.
+        """
+        context.current_stage = f"{self.strategy_type.value}_sampling"
+
+        try:
+            # Pre-sampling validation
+            self._validate_input(context, data)
+
+            # Execute strategy-specific sampling
+            output = await self._do_sample(context, data)
+
+            # Post-processing
+            output = self._post_process(context, output)
+
+            return output
+
+        except Exception as e:
+            logger.error(f"Sampling failed: {e}")
+            raise
+
+    def _validate_input(self, context: SamplingContext, data: Any) -> None:
+        """Validate input data before sampling."""
+        if data is None:
+            raise ValueError("Input data cannot be None")
+
+    @abstractmethod
+    async def _do_sample(
+        self,
+        context: SamplingContext,
+        data: Any,
+    ) -> SamplingOutput:
+        """Strategy-specific sampling implementation.
+
+        Args:
+            context: Sampling context.
+            data: Input data.
+
+        Returns:
+            SamplingOutput with results.
+        """
+        ...
+
+    def _post_process(
+        self,
+        context: SamplingContext,
+        output: SamplingOutput,
+    ) -> SamplingOutput:
+        """Post-process sampling output."""
+        return output
+
+
+# ============================================================================
+# Concrete Strategies
+# ============================================================================
+
+
+class NoSamplingStrategy(BaseSamplingStrategy):
+    """No sampling - use full dataset."""
+
+    @property
+    def strategy_type(self) -> EnterpriseSamplingStrategy:
+        return EnterpriseSamplingStrategy.NONE
+
+    async def _do_sample(
+        self,
+        context: SamplingContext,
+        data: Any,
+    ) -> SamplingOutput:
+        """Return data as-is."""
+        return SamplingOutput(
+            sampled_data=data,
+            sampled_rows=context.row_count,
+        )
+
+
+class BlockSamplingStrategy(BaseSamplingStrategy):
+    """Block-based sampling for 10M-100M row datasets.
+
+    Divides data into fixed-size blocks and samples proportionally
+    from each block. Ensures even coverage across the dataset.
+    """
+
+    @property
+    def strategy_type(self) -> EnterpriseSamplingStrategy:
+        return EnterpriseSamplingStrategy.BLOCK
+
+    @property
+    def supports_parallel(self) -> bool:
+        return True
+
+    def __init__(self, config: BlockSamplingConfig | None = None):
+        self.config = config or BlockSamplingConfig()
+
+    async def _do_sample(
+        self,
+        context: SamplingContext,
+        data: Any,
+    ) -> SamplingOutput:
+        """Perform block-based sampling."""
+        import polars as pl
+
+        target_rows = context.config.target_rows
+
+        # Calculate block size
+        block_size = self.config.block_size
+        if block_size == 0:
+            # Auto-detect: aim for ~100 blocks
+            block_size = max(context.row_count // 100, 10_000)
+
+        num_blocks = math.ceil(context.row_count / block_size)
+        context.blocks_total = num_blocks
+
+        # Calculate samples per block
+        samples_per_block = self.config.sample_per_block
+        if samples_per_block is None:
+            samples_per_block = max(target_rows // num_blocks, 1)
+
+        logger.info(
+            f"Block sampling: {num_blocks} blocks, "
+            f"{samples_per_block} samples/block"
+        )
+
+        # Collect data and sample from each block
+        # In production, this would use truthound's block sampler
+        df = data.collect() if hasattr(data, "collect") else data
+        seed = context.config.seed or 42
+
+        sampled_dfs = []
+        for i in range(num_blocks):
+            start_idx = i * block_size
+            end_idx = min((i + 1) * block_size, len(df))
+            block = df.slice(start_idx, end_idx - start_idx)
+
+            if len(block) > samples_per_block:
+                block = block.sample(n=samples_per_block, seed=seed + i)
+
+            sampled_dfs.append(block)
+            context.blocks_completed = i + 1
+
+        # Combine sampled blocks
+        sampled = pl.concat(sampled_dfs)
+
+        # Trim to target if oversampled
+        if len(sampled) > target_rows:
+            sampled = sampled.sample(n=target_rows, seed=seed)
+
+        return SamplingOutput(
+            sampled_data=sampled.lazy(),
+            sampled_rows=len(sampled),
+            blocks_processed=num_blocks,
+        )
+
+
+class MultiStageSamplingStrategy(BaseSamplingStrategy):
+    """Multi-stage hierarchical sampling for 100M-1B row datasets.
+
+    Progressively reduces data in multiple stages. Each stage
+    reduces by factor (total_rows / target)^(1/stages).
+    """
+
+    @property
+    def strategy_type(self) -> EnterpriseSamplingStrategy:
+        return EnterpriseSamplingStrategy.MULTI_STAGE
+
+    def __init__(self, config: MultiStageSamplingConfig | None = None):
+        self.config = config or MultiStageSamplingConfig()
+
+    async def _do_sample(
+        self,
+        context: SamplingContext,
+        data: Any,
+    ) -> SamplingOutput:
+        """Perform multi-stage sampling."""
+        import polars as pl
+
+        target_rows = context.config.target_rows
+        num_stages = self.config.num_stages
+        seed = context.config.seed or 42
+
+        # Calculate reduction factor per stage
+        if self.config.stage_reduction_factor:
+            reduction = self.config.stage_reduction_factor
+        else:
+            reduction = (context.row_count / target_rows) ** (1 / num_stages)
+
+        logger.info(
+            f"Multi-stage sampling: {num_stages} stages, "
+            f"{reduction:.2f}x reduction per stage"
+        )
+
+        # Collect initial data
+        current_data = data.collect() if hasattr(data, "collect") else data
+        current_rows = len(current_data)
+
+        stages_completed = 0
+        for stage in range(num_stages):
+            target_stage_rows = int(current_rows / reduction)
+            target_stage_rows = max(target_stage_rows, target_rows)
+
+            if target_stage_rows >= current_rows:
+                break
+
+            current_data = current_data.sample(
+                n=target_stage_rows,
+                seed=seed + stage,
+            )
+            current_rows = len(current_data)
+            stages_completed = stage + 1
+
+            logger.debug(f"Stage {stage + 1}: {current_rows} rows")
+
+            # Early stopping check
+            if self.config.early_stop_enabled and current_rows <= target_rows:
+                break
+
+        # Final trim to exact target
+        if current_rows > target_rows:
+            current_data = current_data.sample(n=target_rows, seed=seed)
+
+        return SamplingOutput(
+            sampled_data=current_data.lazy(),
+            sampled_rows=len(current_data),
+            stages_completed=stages_completed,
+            converged_early=stages_completed < num_stages,
+        )
+
+
+class ColumnAwareSamplingStrategy(BaseSamplingStrategy):
+    """Column-aware adaptive sampling for mixed-type datasets.
+
+    Adjusts sample size based on column type complexity:
+    - Strings: 2x multiplier (high cardinality)
+    - Categoricals: 0.5x multiplier (low cardinality)
+    - Complex types: 3x multiplier (List/Struct)
+    - Numeric: 1x baseline
+    """
+
+    @property
+    def strategy_type(self) -> EnterpriseSamplingStrategy:
+        return EnterpriseSamplingStrategy.COLUMN_AWARE
+
+    def __init__(self, config: ColumnAwareSamplingConfig | None = None):
+        self.config = config or ColumnAwareSamplingConfig()
+
+    async def _do_sample(
+        self,
+        context: SamplingContext,
+        data: Any,
+    ) -> SamplingOutput:
+        """Perform column-aware sampling."""
+        import polars as pl
+
+        target_rows = context.config.target_rows
+        seed = context.config.seed or 42
+
+        # Collect schema info
+        if hasattr(data, "collect_schema"):
+            schema = data.collect_schema()
+        else:
+            schema = data.schema
+
+        # Calculate adjusted sample size based on column types
+        type_multipliers = []
+        for col_name, dtype in schema.items():
+            dtype_str = str(dtype).lower()
+
+            if "string" in dtype_str or "utf8" in dtype_str:
+                type_multipliers.append(self.config.string_multiplier)
+            elif "categorical" in dtype_str or "enum" in dtype_str:
+                type_multipliers.append(self.config.categorical_multiplier)
+            elif "list" in dtype_str or "struct" in dtype_str:
+                type_multipliers.append(self.config.complex_multiplier)
+            else:
+                type_multipliers.append(self.config.numeric_multiplier)
+
+        # Use average multiplier
+        avg_multiplier = sum(type_multipliers) / len(type_multipliers)
+        adjusted_target = int(target_rows * avg_multiplier)
+        adjusted_target = min(adjusted_target, context.row_count)
+
+        logger.info(
+            f"Column-aware sampling: {len(type_multipliers)} columns, "
+            f"avg multiplier {avg_multiplier:.2f}, "
+            f"adjusted target {adjusted_target}"
+        )
+
+        # Perform sampling
+        df = data.collect() if hasattr(data, "collect") else data
+
+        if len(df) > adjusted_target:
+            df = df.sample(n=adjusted_target, seed=seed)
+
+        return SamplingOutput(
+            sampled_data=df.lazy(),
+            sampled_rows=len(df),
+        )
+
+
+class ProgressiveSamplingStrategy(BaseSamplingStrategy):
+    """Progressive sampling with convergence detection.
+
+    Iteratively increases sample size until estimates stabilize
+    within convergence threshold. Supports early stopping.
+    """
+
+    @property
+    def strategy_type(self) -> EnterpriseSamplingStrategy:
+        return EnterpriseSamplingStrategy.PROGRESSIVE
+
+    def __init__(self, config: ProgressiveSamplingConfig | None = None):
+        self.config = config or ProgressiveSamplingConfig()
+
+    async def _do_sample(
+        self,
+        context: SamplingContext,
+        data: Any,
+    ) -> SamplingOutput:
+        """Perform progressive sampling."""
+        import polars as pl
+
+        target_rows = context.config.target_rows
+        seed = context.config.seed or 42
+
+        # Collect data
+        df = data.collect() if hasattr(data, "collect") else data
+        total_rows = len(df)
+
+        # Initial sample size
+        current_size = int(total_rows * self.config.initial_sample_ratio)
+        current_size = max(current_size, 1000)
+
+        # Track estimates for convergence check
+        prev_estimates: dict[str, float] = {}
+        stages_completed = 0
+        converged = False
+
+        for stage in range(self.config.max_stages):
+            # Sample current size
+            sample = df.sample(n=min(current_size, total_rows), seed=seed + stage)
+            stages_completed = stage + 1
+
+            # Calculate summary statistics for convergence check
+            numeric_cols = sample.select(pl.selectors.numeric()).columns
+            if numeric_cols:
+                estimates = {}
+                for col in numeric_cols[:5]:  # Check first 5 numeric columns
+                    mean = sample[col].mean()
+                    if mean is not None:
+                        estimates[col] = float(mean)
+
+                # Check convergence
+                if prev_estimates:
+                    max_change = 0.0
+                    for col, val in estimates.items():
+                        if col in prev_estimates and prev_estimates[col] != 0:
+                            change = abs(val - prev_estimates[col]) / abs(prev_estimates[col])
+                            max_change = max(max_change, change)
+
+                    if max_change < self.config.convergence_threshold:
+                        converged = True
+                        logger.info(f"Converged at stage {stage + 1} with change {max_change:.4f}")
+                        break
+
+                prev_estimates = estimates
+
+            # Check if reached target
+            if current_size >= target_rows:
+                break
+
+            # Grow sample size
+            current_size = int(current_size * self.config.growth_factor)
+            current_size = min(current_size, target_rows)
+
+        # Final sample at target size
+        final_sample = df.sample(n=min(target_rows, total_rows), seed=seed)
+
+        return SamplingOutput(
+            sampled_data=final_sample.lazy(),
+            sampled_rows=len(final_sample),
+            stages_completed=stages_completed,
+            converged_early=converged,
+        )
+
+
+# ============================================================================
+# Strategy Factory
+# ============================================================================
+
+
+class SamplingStrategyFactory:
+    """Factory for creating sampling strategies."""
+
+    _strategies: dict[EnterpriseSamplingStrategy, type[BaseSamplingStrategy]] = {
+        EnterpriseSamplingStrategy.NONE: NoSamplingStrategy,
+        EnterpriseSamplingStrategy.BLOCK: BlockSamplingStrategy,
+        EnterpriseSamplingStrategy.MULTI_STAGE: MultiStageSamplingStrategy,
+        EnterpriseSamplingStrategy.COLUMN_AWARE: ColumnAwareSamplingStrategy,
+        EnterpriseSamplingStrategy.PROGRESSIVE: ProgressiveSamplingStrategy,
+    }
+
+    @classmethod
+    def create(
+        cls,
+        strategy: EnterpriseSamplingStrategy,
+        config: EnterpriseSamplingRequest,
+    ) -> BaseSamplingStrategy:
+        """Create a sampling strategy instance.
+
+        Args:
+            strategy: Strategy type to create.
+            config: Sampling configuration.
+
+        Returns:
+            Strategy instance.
+        """
+        strategy_class = cls._strategies.get(strategy)
+
+        if strategy_class is None:
+            # Fall back to adaptive selection
+            logger.warning(f"Strategy {strategy} not found, using column-aware")
+            strategy_class = ColumnAwareSamplingStrategy
+
+        # Pass strategy-specific config if available
+        if strategy == EnterpriseSamplingStrategy.BLOCK and config.block_config:
+            return BlockSamplingStrategy(config.block_config)
+        elif strategy == EnterpriseSamplingStrategy.MULTI_STAGE and config.multi_stage_config:
+            return MultiStageSamplingStrategy(config.multi_stage_config)
+        elif strategy == EnterpriseSamplingStrategy.COLUMN_AWARE and config.column_aware_config:
+            return ColumnAwareSamplingStrategy(config.column_aware_config)
+        elif strategy == EnterpriseSamplingStrategy.PROGRESSIVE and config.progressive_config:
+            return ProgressiveSamplingStrategy(config.progressive_config)
+
+        return strategy_class()
+
+    @classmethod
+    def register(
+        cls,
+        strategy_type: EnterpriseSamplingStrategy,
+        strategy_class: type[BaseSamplingStrategy],
+    ) -> None:
+        """Register a custom sampling strategy.
+
+        Args:
+            strategy_type: Strategy identifier.
+            strategy_class: Strategy class.
+        """
+        cls._strategies[strategy_type] = strategy_class
+
+
+# ============================================================================
+# Enterprise Scale Sampler (Orchestrator)
+# ============================================================================
+
+
+class EnterpriseScaleSampler:
+    """Main orchestrator for enterprise-scale sampling.
+
+    Auto-selects the best sampling strategy based on dataset scale
+    and executes sampling with full observability.
+
+    Example:
+        sampler = EnterpriseScaleSampler()
+        response = await sampler.sample(source_id, config)
+    """
+
+    def __init__(self) -> None:
+        self._active_jobs: dict[str, SamplingJobStatus] = {}
+
+    async def sample(
+        self,
+        config: EnterpriseSamplingRequest,
+        data: Any,
+        row_count: int,
+        column_count: int,
+    ) -> EnterpriseSamplingResponse:
+        """Execute enterprise-scale sampling.
+
+        Args:
+            config: Sampling configuration.
+            data: Input data (Polars LazyFrame).
+            row_count: Total row count.
+            column_count: Total column count.
+
+        Returns:
+            EnterpriseSamplingResponse with results.
+        """
+        job_id = str(uuid.uuid4())
+        started_at = datetime.utcnow()
+
+        # Classify scale
+        scale = classify_dataset_scale(row_count)
+
+        # Create context
+        context = SamplingContext(
+            source_id=config.source_id,
+            job_id=job_id,
+            config=config,
+            row_count=row_count,
+            column_count=column_count,
+            scale_category=scale,
+        )
+
+        # Track job
+        self._active_jobs[job_id] = SamplingJobStatus(
+            job_id=job_id,
+            source_id=config.source_id,
+            status="running",
+            progress=0.0,
+            current_stage="initializing",
+            started_at=started_at,
+        )
+
+        try:
+            # Select strategy
+            strategy_type = self._select_strategy(config, scale)
+
+            # Create strategy
+            strategy = SamplingStrategyFactory.create(strategy_type, config)
+
+            # Execute sampling
+            output = await strategy.sample(context, data)
+
+            # Build metrics
+            metrics = SamplingMetrics(
+                original_rows=row_count,
+                sampled_rows=output.sampled_rows,
+                sampling_ratio=output.sampled_rows / row_count if row_count > 0 else 1.0,
+                strategy_used=strategy.strategy_type,
+                scale_category=scale,
+                is_sampled=output.sampled_rows < row_count,
+                sampling_time_ms=context.elapsed_ms(),
+                throughput_rows_per_sec=row_count / (context.elapsed_ms() / 1000) if context.elapsed_ms() > 0 else 0,
+                speedup_factor=row_count / output.sampled_rows if output.sampled_rows > 0 else 1.0,
+                peak_memory_mb=context.peak_memory_mb,
+                workers_used=config.block_config.parallel.max_workers if config.block_config else 1,
+                blocks_processed=output.blocks_processed,
+                stages_completed=output.stages_completed,
+                converged_early=output.converged_early,
+                backpressure_events=context.backpressure_events,
+            )
+
+            # Update job status
+            self._active_jobs[job_id].status = "completed"
+            self._active_jobs[job_id].progress = 1.0
+
+            return EnterpriseSamplingResponse(
+                source_id=config.source_id,
+                job_id=job_id,
+                status="completed",
+                started_at=started_at,
+                completed_at=datetime.utcnow(),
+                metrics=metrics,
+                sampled_data_path=output.output_path,
+            )
+
+        except Exception as e:
+            logger.error(f"Sampling failed for job {job_id}: {e}")
+
+            self._active_jobs[job_id].status = "failed"
+
+            return EnterpriseSamplingResponse(
+                source_id=config.source_id,
+                job_id=job_id,
+                status="failed",
+                started_at=started_at,
+                completed_at=datetime.utcnow(),
+                error_message=str(e),
+            )
+
+    def _select_strategy(
+        self,
+        config: EnterpriseSamplingRequest,
+        scale: ScaleCategory,
+    ) -> EnterpriseSamplingStrategy:
+        """Select best sampling strategy.
+
+        Args:
+            config: Sampling configuration.
+            scale: Dataset scale category.
+
+        Returns:
+            Selected strategy type.
+        """
+        # If explicitly specified, use it
+        if config.strategy != EnterpriseSamplingStrategy.ADAPTIVE:
+            return config.strategy
+
+        # Auto-select based on scale
+        return SCALE_STRATEGY_MAP.get(scale, EnterpriseSamplingStrategy.COLUMN_AWARE)
+
+    def get_job_status(self, job_id: str) -> SamplingJobStatus | None:
+        """Get status of a sampling job.
+
+        Args:
+            job_id: Job identifier.
+
+        Returns:
+            Job status or None if not found.
+        """
+        return self._active_jobs.get(job_id)
+
+    def list_jobs(self) -> list[SamplingJobStatus]:
+        """List all sampling jobs.
+
+        Returns:
+            List of job statuses.
+        """
+        return list(self._active_jobs.values())
+
+
+# ============================================================================
+# Sample Size Estimator
+# ============================================================================
+
+
+class SampleSizeEstimator:
+    """Estimates optimal sample sizes and provides recommendations."""
+
+    def estimate(self, request: SampleSizeEstimateRequest) -> SampleSizeEstimateResponse:
+        """Estimate optimal sample size.
+
+        Args:
+            request: Estimation request.
+
+        Returns:
+            Estimation response with recommendations.
+        """
+        population_size = request.population_size
+        scale = classify_dataset_scale(population_size)
+
+        # Apply quality preset
+        preset = QUALITY_PRESETS.get(request.quality, QUALITY_PRESETS[SamplingQuality.STANDARD])
+
+        # Calculate sample size using Cochran's formula
+        recommended = calculate_cochran_sample_size(
+            population_size=population_size,
+            confidence_level=request.confidence_level,
+            margin_of_error=request.margin_of_error,
+        )
+
+        # Apply preset target if specified
+        if preset["target_rows"] is not None:
+            recommended = max(recommended, preset["target_rows"])
+
+        # Calculate bounds
+        min_size = max(recommended // 2, 100)
+        max_size = min(recommended * 10, population_size)
+
+        # Get recommended strategy
+        strategy = SCALE_STRATEGY_MAP.get(scale, EnterpriseSamplingStrategy.COLUMN_AWARE)
+
+        # Estimate time and memory
+        estimated_time = estimate_processing_time(population_size, strategy)
+        estimated_memory = estimate_memory_usage(population_size, 50, strategy)  # Assume 50 columns
+
+        # Calculate speedup
+        speedup = population_size / recommended if recommended > 0 else 1.0
+
+        # Build rationale
+        rationale = self._build_rationale(scale, strategy, population_size)
+
+        return SampleSizeEstimateResponse(
+            population_size=population_size,
+            scale_category=scale,
+            recommended_size=recommended,
+            min_size=min_size,
+            max_size=max_size,
+            estimated_time_seconds=estimated_time,
+            estimated_memory_mb=estimated_memory,
+            speedup_factor=speedup,
+            recommended_strategy=strategy,
+            strategy_rationale=rationale,
+        )
+
+    def _build_rationale(
+        self,
+        scale: ScaleCategory,
+        strategy: EnterpriseSamplingStrategy,
+        population_size: int,
+    ) -> str:
+        """Build rationale for strategy recommendation."""
+        rationales = {
+            ScaleCategory.SMALL: "Dataset is small enough for full scan without sampling.",
+            ScaleCategory.MEDIUM: "Column-aware sampling adapts to data types for optimal accuracy.",
+            ScaleCategory.LARGE: "Block sampling ensures even coverage across the dataset with parallel processing.",
+            ScaleCategory.XLARGE: "Multi-stage sampling efficiently reduces billion-row datasets through hierarchical processing.",
+            ScaleCategory.XXLARGE: "Multi-stage sampling with probabilistic sketches for extreme-scale datasets.",
+        }
+        return rationales.get(scale, "Adaptive sampling based on data characteristics.")
+
+
+# ============================================================================
+# Sketch Estimator (Probabilistic Data Structures)
+# ============================================================================
+
+
+class SketchEstimator:
+    """Estimates using truthound probabilistic data structures for 10B+ row datasets.
+
+    Uses truthound.profiler.sketches for O(1) memory aggregations:
+    - HyperLogLog: Cardinality estimation (±0.41% error at precision=14)
+    - CountMinSketch: Frequency estimation and heavy hitters detection
+    - BloomFilter: Membership testing with configurable false positive rate
+    """
+
+    async def estimate(self, request: SketchEstimateRequest, data: Any) -> SketchEstimateResponse:
+        """Run sketch-based estimation.
+
+        Args:
+            request: Sketch estimation request.
+            data: Input data.
+
+        Returns:
+            Sketch estimation response.
+        """
+        start_time = time.time()
+        results: list[SketchEstimateResult] = []
+        total_memory = 0
+
+        config = request.sketch_config or SketchConfig()
+
+        for column in request.columns:
+            col_start = time.time()
+
+            if config.sketch_type == SketchType.HYPERLOGLOG:
+                result = await self._estimate_cardinality(column, data, config)
+            elif config.sketch_type == SketchType.COUNTMIN:
+                result = await self._estimate_frequency(column, data, config)
+            else:
+                result = await self._test_membership(column, data, config)
+
+            result.processing_time_ms = (time.time() - col_start) * 1000
+            results.append(result)
+            total_memory += result.memory_used_bytes
+
+        return SketchEstimateResponse(
+            source_id=request.source_id,
+            results=results,
+            total_time_ms=(time.time() - start_time) * 1000,
+            total_memory_mb=total_memory / (1024 * 1024),
+        )
+
+    async def _estimate_cardinality(
+        self,
+        column: str,
+        data: Any,
+        config: SketchConfig,
+    ) -> SketchEstimateResult:
+        """Estimate cardinality using truthound's HyperLogLog."""
+        df = data.collect() if hasattr(data, "collect") else data
+
+        try:
+            from truthound.profiler.sketches import HyperLogLog, HyperLogLogConfig
+
+            # Create HyperLogLog with specified precision
+            hll_config = HyperLogLogConfig(precision=config.hll_precision)
+            hll = HyperLogLog(hll_config)
+
+            # Add values in batches for efficiency
+            column_values = df[column].drop_nulls().to_list()
+            hll.add_batch(column_values)
+
+            # Get estimate and error
+            cardinality_estimate = hll.estimate()
+            cardinality_error = hll.standard_error()
+
+            # Calculate memory usage
+            memory_bytes = (2 ** config.hll_precision) * 6 // 8
+
+            return SketchEstimateResult(
+                column=column,
+                sketch_type=SketchType.HYPERLOGLOG,
+                cardinality_estimate=cardinality_estimate,
+                cardinality_error=cardinality_error,
+                memory_used_bytes=memory_bytes,
+                processing_time_ms=0.0,
+            )
+
+        except ImportError:
+            logger.warning("truthound.profiler.sketches not available, using fallback")
+            # Fallback to Polars n_unique
+            unique_count = df[column].n_unique()
+            error = 1.04 / math.sqrt(2 ** config.hll_precision)
+            memory_bytes = (2 ** config.hll_precision) * 6 // 8
+
+            return SketchEstimateResult(
+                column=column,
+                sketch_type=SketchType.HYPERLOGLOG,
+                cardinality_estimate=unique_count,
+                cardinality_error=error,
+                memory_used_bytes=memory_bytes,
+                processing_time_ms=0.0,
+            )
+
+    async def _estimate_frequency(
+        self,
+        column: str,
+        data: Any,
+        config: SketchConfig,
+    ) -> SketchEstimateResult:
+        """Estimate frequencies using truthound's Count-Min Sketch."""
+        import polars as pl
+
+        df = data.collect() if hasattr(data, "collect") else data
+
+        try:
+            from truthound.profiler.sketches import CountMinSketch, CountMinSketchConfig
+
+            # Create Count-Min Sketch with specified dimensions
+            cms_config = CountMinSketchConfig(
+                width=config.cms_width,
+                depth=config.cms_depth,
+            )
+            cms = CountMinSketch(cms_config)
+
+            # Add all values
+            column_values = df[column].drop_nulls().to_list()
+            for value in column_values:
+                cms.add(value)
+
+            # Get heavy hitters (items appearing in >1% of stream)
+            heavy_hitters_raw = cms.get_heavy_hitters(threshold=0.01)
+            heavy_hitters = [
+                {"value": str(item), "count": count}
+                for item, count in heavy_hitters_raw[:10]
+            ]
+
+            # Memory = width * depth * 4 bytes (32-bit counters)
+            memory_bytes = config.cms_width * config.cms_depth * 4
+
+            return SketchEstimateResult(
+                column=column,
+                sketch_type=SketchType.COUNTMIN,
+                heavy_hitters=heavy_hitters,
+                memory_used_bytes=memory_bytes,
+                processing_time_ms=0.0,
+            )
+
+        except ImportError:
+            logger.warning("truthound.profiler.sketches not available, using fallback")
+            # Fallback to Polars group_by
+            value_counts = (
+                df.group_by(column)
+                .agg(pl.len().alias("count"))
+                .sort("count", descending=True)
+                .head(10)
+            )
+
+            heavy_hitters = [
+                {"value": str(row[column]), "count": row["count"]}
+                for row in value_counts.iter_rows(named=True)
+            ]
+
+            memory_bytes = config.cms_width * config.cms_depth * 4
+
+            return SketchEstimateResult(
+                column=column,
+                sketch_type=SketchType.COUNTMIN,
+                heavy_hitters=heavy_hitters,
+                memory_used_bytes=memory_bytes,
+                processing_time_ms=0.0,
+            )
+
+    async def _test_membership(
+        self,
+        column: str,
+        data: Any,
+        config: SketchConfig,
+    ) -> SketchEstimateResult:
+        """Test membership using truthound's Bloom Filter."""
+        df = data.collect() if hasattr(data, "collect") else data
+
+        try:
+            from truthound.profiler.sketches import BloomFilter, BloomFilterConfig
+
+            # Create Bloom Filter with specified capacity and error rate
+            bf_config = BloomFilterConfig(
+                capacity=config.bloom_capacity,
+                error_rate=config.bloom_error_rate,
+            )
+            bf = BloomFilter(bf_config)
+
+            # Add all values
+            column_values = df[column].drop_nulls().to_list()
+            for value in column_values:
+                bf.add(value)
+
+            # Get current false positive rate
+            actual_fp_rate = bf.false_positive_rate()
+
+            # Calculate memory usage
+            m = -config.bloom_capacity * math.log(config.bloom_error_rate) / (math.log(2) ** 2)
+            memory_bytes = int(m / 8)
+
+            return SketchEstimateResult(
+                column=column,
+                sketch_type=SketchType.BLOOM,
+                membership_tests={
+                    "items_added": len(column_values),
+                    "false_positive_rate": actual_fp_rate,
+                },
+                memory_used_bytes=memory_bytes,
+                processing_time_ms=0.0,
+            )
+
+        except ImportError:
+            logger.warning("truthound.profiler.sketches not available, using fallback")
+            # Fallback: just calculate memory requirements
+            m = -config.bloom_capacity * math.log(config.bloom_error_rate) / (math.log(2) ** 2)
+            memory_bytes = int(m / 8)
+
+            return SketchEstimateResult(
+                column=column,
+                sketch_type=SketchType.BLOOM,
+                membership_tests={},
+                memory_used_bytes=memory_bytes,
+                processing_time_ms=0.0,
+            )
+
+
+# ============================================================================
+# Singleton Instance
+# ============================================================================
+
+_sampler: EnterpriseScaleSampler | None = None
+_estimator: SampleSizeEstimator | None = None
+_sketch_estimator: SketchEstimator | None = None
+
+
+def get_enterprise_sampler() -> EnterpriseScaleSampler:
+    """Get enterprise sampler singleton."""
+    global _sampler
+    if _sampler is None:
+        _sampler = EnterpriseScaleSampler()
+    return _sampler
+
+
+def get_sample_size_estimator() -> SampleSizeEstimator:
+    """Get sample size estimator singleton."""
+    global _estimator
+    if _estimator is None:
+        _estimator = SampleSizeEstimator()
+    return _estimator
+
+
+def get_sketch_estimator() -> SketchEstimator:
+    """Get sketch estimator singleton."""
+    global _sketch_estimator
+    if _sketch_estimator is None:
+        _sketch_estimator = SketchEstimator()
+    return _sketch_estimator
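
For readers evaluating the new module, here is a minimal harness exercising the public entry points defined in the hunk above. It is an illustrative sketch, not part of the wheel: it assumes polars is installed, and it guesses plausible constructor arguments for EnterpriseSamplingRequest (its schema lives in truthound_dashboard/schemas/enterprise_sampling.py, which this diff does not show), so the exact required fields may differ.

# Illustrative harness (not shipped in the package). Assumes polars is
# installed and that EnterpriseSamplingRequest accepts the fields the module
# body references (source_id, target_rows, strategy, seed); treat those
# kwargs as hypothetical.
import asyncio

import polars as pl

from truthound_dashboard.core.enterprise_sampling import (
    calculate_cochran_sample_size,
    classify_dataset_scale,
    get_enterprise_sampler,
)
from truthound_dashboard.schemas.enterprise_sampling import (
    EnterpriseSamplingRequest,
    EnterpriseSamplingStrategy,
)


async def main() -> None:
    # 5M synthetic rows: classify_dataset_scale() puts this in the MEDIUM band
    # (1M <= rows < 10M), for which ADAPTIVE auto-selects COLUMN_AWARE.
    df = pl.DataFrame({"id": range(5_000_000)}).with_columns(
        value=pl.col("id") % 97,
    )
    row_count, column_count = df.height, df.width
    print(classify_dataset_scale(row_count))

    # Cochran at the defaults (z=1.96, e=0.05, p=0.5): n0 = 1.96^2 * 0.25 /
    # 0.05^2 ≈ 384.2, and the finite-population correction barely moves it
    # for N=5M, so this prints 385 after the ceiling.
    print(calculate_cochran_sample_size(row_count))

    request = EnterpriseSamplingRequest(  # hypothetical field values
        source_id="demo-source",
        target_rows=100_000,
        strategy=EnterpriseSamplingStrategy.ADAPTIVE,
        seed=42,
    )
    sampler = get_enterprise_sampler()
    response = await sampler.sample(request, df.lazy(), row_count, column_count)
    print(response.status, response.metrics)


if __name__ == "__main__":
    asyncio.run(main())

One design note visible in the source: EnterpriseScaleSampler keeps job state in-process (_active_jobs), so get_job_status() only sees jobs started on the same instance, which is presumably why the module exposes the get_enterprise_sampler() singleton rather than encouraging ad-hoc construction.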