truthound-dashboard 1.3.1__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +258 -0
- truthound_dashboard/api/anomaly.py +1302 -0
- truthound_dashboard/api/cross_alerts.py +352 -0
- truthound_dashboard/api/deps.py +143 -0
- truthound_dashboard/api/drift_monitor.py +540 -0
- truthound_dashboard/api/lineage.py +1151 -0
- truthound_dashboard/api/maintenance.py +363 -0
- truthound_dashboard/api/middleware.py +373 -1
- truthound_dashboard/api/model_monitoring.py +805 -0
- truthound_dashboard/api/notifications_advanced.py +2452 -0
- truthound_dashboard/api/plugins.py +2096 -0
- truthound_dashboard/api/profile.py +211 -14
- truthound_dashboard/api/reports.py +853 -0
- truthound_dashboard/api/router.py +147 -0
- truthound_dashboard/api/rule_suggestions.py +310 -0
- truthound_dashboard/api/schema_evolution.py +231 -0
- truthound_dashboard/api/sources.py +47 -3
- truthound_dashboard/api/triggers.py +190 -0
- truthound_dashboard/api/validations.py +13 -0
- truthound_dashboard/api/validators.py +333 -4
- truthound_dashboard/api/versioning.py +309 -0
- truthound_dashboard/api/websocket.py +301 -0
- truthound_dashboard/core/__init__.py +27 -0
- truthound_dashboard/core/anomaly.py +1395 -0
- truthound_dashboard/core/anomaly_explainer.py +633 -0
- truthound_dashboard/core/cache.py +206 -0
- truthound_dashboard/core/cached_services.py +422 -0
- truthound_dashboard/core/charts.py +352 -0
- truthound_dashboard/core/connections.py +1069 -42
- truthound_dashboard/core/cross_alerts.py +837 -0
- truthound_dashboard/core/drift_monitor.py +1477 -0
- truthound_dashboard/core/drift_sampling.py +669 -0
- truthound_dashboard/core/i18n/__init__.py +42 -0
- truthound_dashboard/core/i18n/detector.py +173 -0
- truthound_dashboard/core/i18n/messages.py +564 -0
- truthound_dashboard/core/lineage.py +971 -0
- truthound_dashboard/core/maintenance.py +443 -5
- truthound_dashboard/core/model_monitoring.py +1043 -0
- truthound_dashboard/core/notifications/channels.py +1020 -1
- truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
- truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
- truthound_dashboard/core/notifications/deduplication/service.py +400 -0
- truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
- truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
- truthound_dashboard/core/notifications/dispatcher.py +43 -0
- truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
- truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
- truthound_dashboard/core/notifications/escalation/engine.py +429 -0
- truthound_dashboard/core/notifications/escalation/models.py +336 -0
- truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
- truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
- truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
- truthound_dashboard/core/notifications/events.py +49 -0
- truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
- truthound_dashboard/core/notifications/metrics/base.py +528 -0
- truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
- truthound_dashboard/core/notifications/routing/__init__.py +169 -0
- truthound_dashboard/core/notifications/routing/combinators.py +184 -0
- truthound_dashboard/core/notifications/routing/config.py +375 -0
- truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
- truthound_dashboard/core/notifications/routing/engine.py +382 -0
- truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
- truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
- truthound_dashboard/core/notifications/routing/rules.py +625 -0
- truthound_dashboard/core/notifications/routing/validator.py +678 -0
- truthound_dashboard/core/notifications/service.py +2 -0
- truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
- truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
- truthound_dashboard/core/notifications/throttling/builder.py +311 -0
- truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
- truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
- truthound_dashboard/core/openlineage.py +1028 -0
- truthound_dashboard/core/plugins/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/extractor.py +703 -0
- truthound_dashboard/core/plugins/docs/renderers.py +804 -0
- truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
- truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
- truthound_dashboard/core/plugins/hooks/manager.py +403 -0
- truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
- truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
- truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
- truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
- truthound_dashboard/core/plugins/loader.py +504 -0
- truthound_dashboard/core/plugins/registry.py +810 -0
- truthound_dashboard/core/plugins/reporter_executor.py +588 -0
- truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
- truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
- truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
- truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
- truthound_dashboard/core/plugins/sandbox.py +617 -0
- truthound_dashboard/core/plugins/security/__init__.py +68 -0
- truthound_dashboard/core/plugins/security/analyzer.py +535 -0
- truthound_dashboard/core/plugins/security/policies.py +311 -0
- truthound_dashboard/core/plugins/security/protocols.py +296 -0
- truthound_dashboard/core/plugins/security/signing.py +842 -0
- truthound_dashboard/core/plugins/security.py +446 -0
- truthound_dashboard/core/plugins/validator_executor.py +401 -0
- truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
- truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
- truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
- truthound_dashboard/core/plugins/versioning/semver.py +266 -0
- truthound_dashboard/core/profile_comparison.py +601 -0
- truthound_dashboard/core/report_history.py +570 -0
- truthound_dashboard/core/reporters/__init__.py +57 -0
- truthound_dashboard/core/reporters/base.py +296 -0
- truthound_dashboard/core/reporters/csv_reporter.py +155 -0
- truthound_dashboard/core/reporters/html_reporter.py +598 -0
- truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
- truthound_dashboard/core/reporters/i18n/base.py +494 -0
- truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
- truthound_dashboard/core/reporters/json_reporter.py +160 -0
- truthound_dashboard/core/reporters/junit_reporter.py +233 -0
- truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
- truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
- truthound_dashboard/core/reporters/registry.py +272 -0
- truthound_dashboard/core/rule_generator.py +2088 -0
- truthound_dashboard/core/scheduler.py +822 -12
- truthound_dashboard/core/schema_evolution.py +858 -0
- truthound_dashboard/core/services.py +152 -9
- truthound_dashboard/core/statistics.py +718 -0
- truthound_dashboard/core/streaming_anomaly.py +883 -0
- truthound_dashboard/core/triggers/__init__.py +45 -0
- truthound_dashboard/core/triggers/base.py +226 -0
- truthound_dashboard/core/triggers/evaluators.py +609 -0
- truthound_dashboard/core/triggers/factory.py +363 -0
- truthound_dashboard/core/unified_alerts.py +870 -0
- truthound_dashboard/core/validation_limits.py +509 -0
- truthound_dashboard/core/versioning.py +709 -0
- truthound_dashboard/core/websocket/__init__.py +59 -0
- truthound_dashboard/core/websocket/manager.py +512 -0
- truthound_dashboard/core/websocket/messages.py +130 -0
- truthound_dashboard/db/__init__.py +30 -0
- truthound_dashboard/db/models.py +3375 -3
- truthound_dashboard/main.py +22 -0
- truthound_dashboard/schemas/__init__.py +396 -1
- truthound_dashboard/schemas/anomaly.py +1258 -0
- truthound_dashboard/schemas/base.py +4 -0
- truthound_dashboard/schemas/cross_alerts.py +334 -0
- truthound_dashboard/schemas/drift_monitor.py +890 -0
- truthound_dashboard/schemas/lineage.py +428 -0
- truthound_dashboard/schemas/maintenance.py +154 -0
- truthound_dashboard/schemas/model_monitoring.py +374 -0
- truthound_dashboard/schemas/notifications_advanced.py +1363 -0
- truthound_dashboard/schemas/openlineage.py +704 -0
- truthound_dashboard/schemas/plugins.py +1293 -0
- truthound_dashboard/schemas/profile.py +420 -34
- truthound_dashboard/schemas/profile_comparison.py +242 -0
- truthound_dashboard/schemas/reports.py +285 -0
- truthound_dashboard/schemas/rule_suggestion.py +434 -0
- truthound_dashboard/schemas/schema_evolution.py +164 -0
- truthound_dashboard/schemas/source.py +117 -2
- truthound_dashboard/schemas/triggers.py +511 -0
- truthound_dashboard/schemas/unified_alerts.py +223 -0
- truthound_dashboard/schemas/validation.py +25 -1
- truthound_dashboard/schemas/validators/__init__.py +11 -0
- truthound_dashboard/schemas/validators/base.py +151 -0
- truthound_dashboard/schemas/versioning.py +152 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -22
- truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
- truthound_dashboard/static/assets/index-BZG20KuF.js +0 -586
- truthound_dashboard/static/assets/index-D_HyZ3pb.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +0 -1
- truthound_dashboard-1.3.1.dist-info/RECORD +0 -110
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,669 @@
|
|
|
1
|
+
"""Drift sampling strategies for large-scale datasets.
|
|
2
|
+
|
|
3
|
+
This module provides various sampling strategies optimized for drift detection
|
|
4
|
+
on 100M+ row datasets. It includes random, stratified, and reservoir sampling
|
|
5
|
+
with automatic sample size estimation based on confidence levels.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import math
|
|
12
|
+
import random
|
|
13
|
+
from abc import ABC, abstractmethod
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from enum import Enum
|
|
16
|
+
from typing import Any, Iterator, Sequence, TypeVar
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
T = TypeVar("T")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SamplingMethod(str, Enum):
    """Available sampling methods for large datasets."""

    # Uniform random sampling without replacement.
    RANDOM = "random"
    # Proportional sampling per category of a stratification column.
    STRATIFIED = "stratified"
    # Single-pass Algorithm R sampling, suitable for streams.
    RESERVOIR = "reservoir"
    # Evenly spaced sampling with a random starting offset.
    SYSTEMATIC = "systematic"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class SampleSizeEstimate:
    """Estimated sample size for drift detection.

    Attributes:
        recommended_size: Recommended sample size for target confidence.
        min_size: Minimum sample size for basic detection.
        max_size: Maximum useful sample size (diminishing returns beyond).
        confidence_level: Target confidence level (0.0-1.0).
        margin_of_error: Expected margin of error at recommended size.
        estimated_time_seconds: Estimated processing time.
        memory_mb: Estimated memory usage in MB.
    """

    recommended_size: int
    min_size: int
    max_size: int
    # Confidence level in the range 0.0-1.0 (e.g. 0.95 for 95%).
    confidence_level: float
    # Margin of error as a proportion (e.g. 0.03 for +/-3%).
    margin_of_error: float
    estimated_time_seconds: float
    memory_mb: float
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class ChunkedComparisonProgress:
    """Progress tracking for chunked comparison operations.

    Attributes:
        total_chunks: Total number of chunks to process.
        processed_chunks: Number of chunks already processed.
        total_rows: Total rows across all chunks.
        processed_rows: Rows processed so far.
        current_chunk: Current chunk being processed.
        elapsed_seconds: Time elapsed since start.
        estimated_remaining_seconds: Estimated time remaining.
        columns_with_drift: Columns detected with drift so far.
        early_stop_triggered: Whether early stopping was triggered.
        status: Current status (running, completed, cancelled, error).
    """

    total_chunks: int
    processed_chunks: int
    total_rows: int
    processed_rows: int
    # 1-based index of the most recently completed chunk.
    current_chunk: int
    elapsed_seconds: float
    estimated_remaining_seconds: float
    columns_with_drift: list[str]
    early_stop_triggered: bool
    status: str  # running, completed, cancelled, error
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class BaseSampler(ABC):
    """Abstract base class for sampling strategies.

    Concrete subclasses provide both value-level sampling (``sample``) and
    index-level sampling (``sample_indices``), so callers can either sample
    already-materialized data or defer row retrieval to a storage layer.
    """

    @abstractmethod
    def sample(self, data: Sequence[T], sample_size: int) -> list[T]:
        """Sample data from the input sequence.

        Args:
            data: Input data sequence.
            sample_size: Number of samples to extract.

        Returns:
            List of sampled items.
        """
        pass

    @abstractmethod
    def sample_indices(self, total_size: int, sample_size: int) -> list[int]:
        """Generate sample indices for a dataset of given size.

        Args:
            total_size: Total number of rows in dataset.
            sample_size: Number of samples to extract.

        Returns:
            List of indices to sample.
        """
        pass
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class RandomSampler(BaseSampler):
    """Uniform random sampling without replacement.

    Best for: General-purpose sampling when no stratification is needed.
    Time complexity: O(n) for data, O(k) for indices where k is sample size.
    """

    def __init__(self, seed: int | None = None) -> None:
        """Create the sampler.

        Args:
            seed: Random seed for reproducibility.
        """
        self.rng = random.Random(seed)

    def sample(self, data: Sequence[T], sample_size: int) -> list[T]:
        """Draw a uniform random sample from *data* without replacement."""
        population = len(data)
        # Asking for at least the whole dataset degenerates to a full copy.
        if population <= sample_size:
            return list(data)
        return [data[idx] for idx in self.sample_indices(population, sample_size)]

    def sample_indices(self, total_size: int, sample_size: int) -> list[int]:
        """Pick *sample_size* distinct positions out of *total_size*."""
        if total_size <= sample_size:
            return list(range(total_size))
        return self.rng.sample(range(total_size), sample_size)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class StratifiedSampler(BaseSampler):
    """Stratified sampling based on a stratification column.

    Best for: Ensuring representation of all categories in drift detection.
    Time complexity: O(n) where n is data size.
    """

    def __init__(
        self,
        strata_column: str | int,
        seed: int | None = None,
    ) -> None:
        """Create the sampler.

        Args:
            strata_column: Column name or index for stratification.
            seed: Random seed for reproducibility.
        """
        self.strata_column = strata_column
        self.rng = random.Random(seed)

    def sample(self, data: Sequence[dict[str, Any]], sample_size: int) -> list[dict[str, Any]]:
        """Draw a proportional sample from every stratum.

        Args:
            data: Input data with dict rows.
            sample_size: Total number of samples.

        Returns:
            Stratified sample maintaining proportions.
        """
        total = len(data)
        if sample_size >= total:
            return list(data)

        # Bucket row positions by stratum value, in first-seen order.
        buckets: dict[Any, list[int]] = {}
        for position, row in enumerate(data):
            if isinstance(row, dict):
                stratum = row.get(self.strata_column)
            else:
                stratum = row[self.strata_column]
            buckets.setdefault(stratum, []).append(position)

        # Give every stratum a proportional share, but never less than one
        # row and never more rows than the stratum actually holds.
        chosen: list[int] = []
        for positions in buckets.values():
            share = max(1, int(sample_size * (len(positions) / total)))
            share = min(share, len(positions))
            chosen.extend(self.rng.sample(positions, share))

        # Integer truncation can leave the sample short: top up randomly
        # from positions not yet selected.
        shortfall = sample_size - len(chosen)
        if shortfall > 0:
            unused = set(range(total)) - set(chosen)
            chosen.extend(self.rng.sample(list(unused), min(shortfall, len(unused))))

        return [data[idx] for idx in chosen[:sample_size]]

    def sample_indices(self, total_size: int, sample_size: int) -> list[int]:
        """Generate stratified indices (falls back to random without strata info)."""
        if sample_size >= total_size:
            return list(range(total_size))
        return self.rng.sample(range(total_size), sample_size)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class ReservoirSampler(BaseSampler):
    """Reservoir sampling for streaming data (Algorithm R).

    Best for: Single-pass sampling of very large datasets or streams.
    Time complexity: O(n) single pass.
    Space complexity: O(k) where k is sample size.
    """

    def __init__(self, seed: int | None = None) -> None:
        """Create the sampler.

        Args:
            seed: Random seed for reproducibility.
        """
        self.rng = random.Random(seed)

    def sample(self, data: Sequence[T], sample_size: int) -> list[T]:
        """Materialize a reservoir sample of *data*."""
        return list(self.sample_stream(iter(data), sample_size))

    def sample_stream(self, stream: Iterator[T], sample_size: int) -> Iterator[T]:
        """Sample from a stream using reservoir sampling.

        Args:
            stream: Input data stream.
            sample_size: Number of samples to maintain.

        Yields:
            Sampled items after stream is exhausted.
        """
        pool: list[T] = []
        for position, element in enumerate(stream):
            # The first k elements fill the reservoir directly.
            if position < sample_size:
                pool.append(element)
                continue
            # Each later element displaces a reservoir slot with
            # probability k/(position+1).
            slot = self.rng.randint(0, position)
            if slot < sample_size:
                pool[slot] = element

        yield from pool

    def sample_indices(self, total_size: int, sample_size: int) -> list[int]:
        """Generate reservoir-style indices."""
        if sample_size >= total_size:
            return list(range(total_size))

        pool = list(range(sample_size))
        for position in range(sample_size, total_size):
            slot = self.rng.randint(0, position)
            if slot < sample_size:
                pool[slot] = position

        return sorted(pool)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
class SystematicSampler(BaseSampler):
    """Systematic sampling with random start.

    Best for: Evenly distributed sampling across ordered data.
    Time complexity: O(k) where k is sample size.
    """

    def __init__(self, seed: int | None = None) -> None:
        """Create the sampler.

        Args:
            seed: Random seed for reproducibility.
        """
        self.rng = random.Random(seed)

    def sample(self, data: Sequence[T], sample_size: int) -> list[T]:
        """Pick evenly spaced elements of *data* from a random offset."""
        return [data[idx] for idx in self.sample_indices(len(data), sample_size)]

    def sample_indices(self, total_size: int, sample_size: int) -> list[int]:
        """Generate systematic sample indices."""
        if sample_size >= total_size:
            return list(range(total_size))

        step = total_size / sample_size
        offset = self.rng.uniform(0, step)

        # Walk the index axis in fixed strides; an index can land past the
        # end due to float rounding, so those are dropped.
        positions = (int(offset + k * step) for k in range(sample_size))
        return [pos for pos in positions if pos < total_size]
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def get_sampler(method: SamplingMethod, **kwargs) -> BaseSampler:
    """Factory function to get the appropriate sampler.

    Args:
        method: Sampling method to use.
        **kwargs: Additional arguments for the sampler (``seed``, and
            ``strata_column`` for stratified sampling).

    Returns:
        Configured sampler instance.

    Raises:
        ValueError: If *method* does not match any known sampling method.
    """
    seed = kwargs.get("seed")

    # Equality (not identity / dict lookup) is used deliberately so plain
    # strings like "random" also match the str-based enum members.
    if method == SamplingMethod.STRATIFIED:
        return StratifiedSampler(strata_column=kwargs.get("strata_column", 0), seed=seed)
    if method == SamplingMethod.RANDOM:
        return RandomSampler(seed=seed)
    if method == SamplingMethod.RESERVOIR:
        return ReservoirSampler(seed=seed)
    if method == SamplingMethod.SYSTEMATIC:
        return SystematicSampler(seed=seed)
    raise ValueError(f"Unknown sampling method: {method}")
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def estimate_sample_size(
    population_size: int,
    confidence_level: float = 0.95,
    margin_of_error: float = 0.03,
    expected_drift_rate: float = 0.1,
    num_columns: int = 10,
) -> SampleSizeEstimate:
    """Estimate optimal sample size for drift detection.

    Uses Cochran's formula adjusted for drift detection requirements.

    Args:
        population_size: Total number of rows in the dataset.
        confidence_level: Target confidence level (default 0.95). Must be
            strictly between 0 and 1.
        margin_of_error: Acceptable margin of error (default 0.03 = 3%).
            Must be non-zero.
        expected_drift_rate: Expected proportion of drifted values (default 0.1).
        num_columns: Number of columns to analyze (affects computation time).

    Returns:
        SampleSizeEstimate with recommended sizes and estimates.
    """
    # Pre-tabulated two-sided z-scores for the common confidence levels.
    # Kept so results for these levels match earlier releases exactly.
    z_scores = {
        0.90: 1.645,
        0.95: 1.96,
        0.99: 2.576,
    }
    z = z_scores.get(confidence_level)
    if z is None:
        # Fix: previously any unlisted confidence level silently fell back to
        # the 95% z-score (1.96). Compute the exact two-sided quantile instead.
        from statistics import NormalDist

        z = NormalDist().inv_cdf((1.0 + confidence_level) / 2.0)

    # Cochran's formula for sample size
    p = expected_drift_rate
    q = 1 - p
    n0 = (z ** 2 * p * q) / (margin_of_error ** 2)

    # Finite population correction
    if population_size > 0:
        n = n0 / (1 + (n0 - 1) / population_size)
    else:
        n = n0

    recommended = int(math.ceil(n))

    # Minimum sample size for statistical validity
    min_size = max(100, int(recommended * 0.3))

    # Maximum useful sample size (diminishing returns)
    max_size = min(population_size, int(recommended * 3))

    # Ensure ordering: min_size <= recommended <= max_size
    recommended = max(min_size, min(recommended, max_size))

    # Estimate processing time (rough heuristic):
    # assume ~10,000 rows/second per column.
    rows_per_second = 10000
    estimated_time = (recommended * num_columns) / rows_per_second

    # Estimate memory usage (~100 bytes per row per column for numeric data)
    bytes_per_row = 100 * num_columns
    memory_mb = (recommended * bytes_per_row) / (1024 * 1024)

    return SampleSizeEstimate(
        recommended_size=recommended,
        min_size=min_size,
        max_size=max_size,
        confidence_level=confidence_level,
        margin_of_error=margin_of_error,
        estimated_time_seconds=round(estimated_time, 2),
        memory_mb=round(memory_mb, 2),
    )
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def calculate_chunk_size(
    total_rows: int,
    available_memory_mb: float = 1024,
    bytes_per_row: int = 1000,
    target_chunks: int | None = None,
) -> int:
    """Calculate optimal chunk size for processing large datasets.

    Args:
        total_rows: Total number of rows to process.
        available_memory_mb: Available memory in MB.
        bytes_per_row: Estimated bytes per row.
        target_chunks: Target number of chunks (optional).

    Returns:
        Optimal chunk size in rows.
    """
    # How many rows would saturate the memory budget.
    memory_capacity_rows = int((available_memory_mb * 1024 * 1024) / bytes_per_row)

    # Keep 20% headroom for safety.
    safe_size = int(memory_capacity_rows * 0.8)

    chunk_size = safe_size
    if target_chunks:
        # Honour the requested chunk count unless it would exceed the
        # memory budget.
        chunk_size = min(total_rows // target_chunks, safe_size)

    # Clamp to sane bounds regardless of inputs.
    lower_bound = 10000
    upper_bound = 10_000_000
    return max(lower_bound, min(chunk_size, upper_bound))
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def should_early_stop(
    columns_with_drift: list[str],
    total_columns: int,
    threshold: float = 0.5,
    min_processed: int = 3,
) -> bool:
    """Determine if early stopping should be triggered.

    Early stopping is useful when drift is obvious and processing
    more data won't change the conclusion.

    Args:
        columns_with_drift: List of columns where drift was detected.
        total_columns: Total number of columns being analyzed.
        threshold: Proportion of drifted columns to trigger early stop.
        min_processed: Minimum columns to process before considering early stop.

    Returns:
        True if early stopping should be triggered.
    """
    # NOTE(review): min_processed is documented against *processed* columns,
    # but only the drifted-column count is available here, so it is applied
    # to that count — confirm intent against callers.
    if len(columns_with_drift) < min_processed:
        return False

    # Fix: guard against ZeroDivisionError when no columns are analyzed.
    if total_columns <= 0:
        return False

    drift_rate = len(columns_with_drift) / total_columns
    return drift_rate >= threshold
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
class ChunkedComparisonTracker:
    """Tracks progress of chunked comparison operations.

    Thread-safe progress tracking for long-running drift detection jobs.
    """

    def __init__(
        self,
        total_rows: int,
        chunk_size: int,
        total_columns: int,
    ) -> None:
        """Set up tracking state.

        Args:
            total_rows: Total rows to process.
            chunk_size: Size of each chunk.
            total_columns: Number of columns being compared.
        """
        self.total_rows = total_rows
        self.chunk_size = chunk_size
        self.total_columns = total_columns
        self.total_chunks = math.ceil(total_rows / chunk_size)

        # Mutable progress counters.
        self.processed_chunks = 0
        self.processed_rows = 0
        self.current_chunk = 0
        self.columns_with_drift: list[str] = []
        self.early_stop_triggered = False
        self.status = "running"

        # Wall-clock bookkeeping used for the time estimates.
        self._start_time: float | None = None
        self._chunk_times: list[float] = []

    def start(self) -> None:
        """Mark the start of processing."""
        import time

        self._start_time = time.time()
        self.status = "running"

    def update_chunk(
        self,
        chunk_index: int,
        rows_in_chunk: int,
        drifted_columns: list[str],
        chunk_time: float,
    ) -> None:
        """Update progress after processing a chunk.

        Args:
            chunk_index: Index of the completed chunk.
            rows_in_chunk: Number of rows in this chunk.
            drifted_columns: Columns with drift detected in this chunk.
            chunk_time: Time taken to process this chunk.
        """
        completed = chunk_index + 1
        self.current_chunk = completed
        self.processed_chunks = completed
        self.processed_rows += rows_in_chunk
        self._chunk_times.append(chunk_time)

        # Record newly drifted columns, preserving first-seen order.
        for name in drifted_columns:
            if name not in self.columns_with_drift:
                self.columns_with_drift.append(name)

    def trigger_early_stop(self) -> None:
        """Trigger early stopping."""
        self.early_stop_triggered = True
        self.status = "completed"

    def complete(self) -> None:
        """Mark processing as complete."""
        self.status = "completed"

    def cancel(self) -> None:
        """Mark processing as cancelled."""
        self.status = "cancelled"

    def error(self, message: str) -> None:
        """Mark processing as failed."""
        self.status = "error"
        logger.error(f"Chunked comparison failed: {message}")

    def get_progress(self) -> ChunkedComparisonProgress:
        """Get current progress status.

        Returns:
            Current progress information.
        """
        import time

        if self._start_time:
            elapsed = time.time() - self._start_time
        else:
            elapsed = 0.0

        # Project remaining time from the mean per-chunk duration so far.
        remaining = 0.0
        if self._chunk_times and self.processed_chunks < self.total_chunks:
            mean_chunk_time = sum(self._chunk_times) / len(self._chunk_times)
            remaining = mean_chunk_time * (self.total_chunks - self.processed_chunks)

        return ChunkedComparisonProgress(
            total_chunks=self.total_chunks,
            processed_chunks=self.processed_chunks,
            total_rows=self.total_rows,
            processed_rows=self.processed_rows,
            current_chunk=self.current_chunk,
            elapsed_seconds=round(elapsed, 2),
            estimated_remaining_seconds=round(remaining, 2),
            columns_with_drift=list(self.columns_with_drift),
            early_stop_triggered=self.early_stop_triggered,
            status=self.status,
        )
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
async def parallel_column_compare(
    baseline_column_data: dict[str, list[Any]],
    current_column_data: dict[str, list[Any]],
    method: str = "auto",
    threshold: float = 0.05,
    max_workers: int = 4,
) -> dict[str, dict[str, Any]]:
    """Compare multiple columns in parallel.

    Bug fix vs. previous revision: the old implementation submitted every
    comparison to a ThreadPoolExecutor (wrapped in per-thread ``asyncio.run``
    calls), discarded those futures without awaiting them, and then re-ran
    every comparison a second time sequentially — doing all the work twice
    and never actually parallelizing. Comparisons now run exactly once,
    fanned out across the executor and gathered on the running loop.

    Args:
        baseline_column_data: Dict mapping column names to baseline values.
        current_column_data: Dict mapping column names to current values.
        method: Drift detection method (recorded in numeric results).
        threshold: Drift threshold for the mean-shift / new-category tests.
        max_workers: Maximum parallel worker threads.

    Returns:
        Dict mapping column names to drift results.
    """
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    def compare_column(column: str) -> tuple[str, dict[str, Any]]:
        """Compare a single column (runs in a worker thread)."""
        baseline = baseline_column_data.get(column, [])
        current = current_column_data.get(column, [])

        # Simple drift detection (placeholder - actual implementation would use truthound)
        if not baseline or not current:
            return column, {"drifted": False, "error": "No data"}

        try:
            baseline_arr = np.array(baseline, dtype=float)
            current_arr = np.array(current, dtype=float)

            # Simple statistical comparison
            baseline_mean = np.mean(baseline_arr)
            current_mean = np.mean(current_arr)
            baseline_std = np.std(baseline_arr)

            # Z-score test for mean shift
            if baseline_std > 0:
                z_score = abs(current_mean - baseline_mean) / baseline_std
                drifted = z_score > 2.0  # Simplified threshold
            else:
                drifted = abs(current_mean - baseline_mean) > threshold

            return column, {
                # bool() so the result is JSON-serializable (not np.bool_)
                "drifted": bool(drifted),
                "baseline_mean": float(baseline_mean),
                "current_mean": float(current_mean),
                "baseline_std": float(baseline_std),
                "method": method,
            }
        except (ValueError, TypeError):
            # Non-numeric column - use categorical comparison
            baseline_set = set(baseline)
            current_set = set(current)
            new_values = current_set - baseline_set
            drifted = len(new_values) > len(baseline_set) * threshold

            return column, {
                "drifted": drifted,
                "new_categories": list(new_values)[:10],
                "method": "categorical",
            }

    columns = list(baseline_column_data.keys())

    # Fan the (CPU-bound-ish) comparisons out to worker threads and await
    # all of them at once; each column is compared exactly once.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pairs = await asyncio.gather(
            *(loop.run_in_executor(executor, compare_column, col) for col in columns)
        )

    return dict(pairs)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Internationalization module for truthound-dashboard.
|
|
2
|
+
|
|
3
|
+
This module provides unified i18n support for:
|
|
4
|
+
- API error messages
|
|
5
|
+
- Validation result messages
|
|
6
|
+
- User-facing content
|
|
7
|
+
|
|
8
|
+
It reuses the SupportedLocale enum from reporters/i18n for consistency.
|
|
9
|
+
|
|
10
|
+
Example:
|
|
11
|
+
from truthound_dashboard.core.i18n import (
|
|
12
|
+
get_message,
|
|
13
|
+
detect_locale,
|
|
14
|
+
SupportedLocale,
|
|
15
|
+
SUPPORTED_LOCALES,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# Get error message in detected locale
|
|
19
|
+
locale = detect_locale(request)
|
|
20
|
+
message = get_message("error.source_not_found", locale)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from truthound_dashboard.core.reporters.i18n.base import SupportedLocale
|
|
24
|
+
|
|
25
|
+
from .detector import detect_locale, parse_accept_language
|
|
26
|
+
from .messages import ERROR_MESSAGES, get_all_messages, get_message
|
|
27
|
+
|
|
28
|
+
# All supported locale codes
|
|
29
|
+
SUPPORTED_LOCALES = [locale.value for locale in SupportedLocale]
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
# Core classes
|
|
33
|
+
"SupportedLocale",
|
|
34
|
+
# Functions
|
|
35
|
+
"get_message",
|
|
36
|
+
"get_all_messages",
|
|
37
|
+
"detect_locale",
|
|
38
|
+
"parse_accept_language",
|
|
39
|
+
# Constants
|
|
40
|
+
"SUPPORTED_LOCALES",
|
|
41
|
+
"ERROR_MESSAGES",
|
|
42
|
+
]
|