truthound-dashboard 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. truthound_dashboard/api/alerts.py +258 -0
  2. truthound_dashboard/api/anomaly.py +1302 -0
  3. truthound_dashboard/api/cross_alerts.py +352 -0
  4. truthound_dashboard/api/deps.py +143 -0
  5. truthound_dashboard/api/drift_monitor.py +540 -0
  6. truthound_dashboard/api/lineage.py +1151 -0
  7. truthound_dashboard/api/maintenance.py +363 -0
  8. truthound_dashboard/api/middleware.py +373 -1
  9. truthound_dashboard/api/model_monitoring.py +805 -0
  10. truthound_dashboard/api/notifications_advanced.py +2452 -0
  11. truthound_dashboard/api/plugins.py +2096 -0
  12. truthound_dashboard/api/profile.py +211 -14
  13. truthound_dashboard/api/reports.py +853 -0
  14. truthound_dashboard/api/router.py +147 -0
  15. truthound_dashboard/api/rule_suggestions.py +310 -0
  16. truthound_dashboard/api/schema_evolution.py +231 -0
  17. truthound_dashboard/api/sources.py +47 -3
  18. truthound_dashboard/api/triggers.py +190 -0
  19. truthound_dashboard/api/validations.py +13 -0
  20. truthound_dashboard/api/validators.py +333 -4
  21. truthound_dashboard/api/versioning.py +309 -0
  22. truthound_dashboard/api/websocket.py +301 -0
  23. truthound_dashboard/core/__init__.py +27 -0
  24. truthound_dashboard/core/anomaly.py +1395 -0
  25. truthound_dashboard/core/anomaly_explainer.py +633 -0
  26. truthound_dashboard/core/cache.py +206 -0
  27. truthound_dashboard/core/cached_services.py +422 -0
  28. truthound_dashboard/core/charts.py +352 -0
  29. truthound_dashboard/core/connections.py +1069 -42
  30. truthound_dashboard/core/cross_alerts.py +837 -0
  31. truthound_dashboard/core/drift_monitor.py +1477 -0
  32. truthound_dashboard/core/drift_sampling.py +669 -0
  33. truthound_dashboard/core/i18n/__init__.py +42 -0
  34. truthound_dashboard/core/i18n/detector.py +173 -0
  35. truthound_dashboard/core/i18n/messages.py +564 -0
  36. truthound_dashboard/core/lineage.py +971 -0
  37. truthound_dashboard/core/maintenance.py +443 -5
  38. truthound_dashboard/core/model_monitoring.py +1043 -0
  39. truthound_dashboard/core/notifications/channels.py +1020 -1
  40. truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
  41. truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
  42. truthound_dashboard/core/notifications/deduplication/service.py +400 -0
  43. truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
  44. truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
  45. truthound_dashboard/core/notifications/dispatcher.py +43 -0
  46. truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
  47. truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
  48. truthound_dashboard/core/notifications/escalation/engine.py +429 -0
  49. truthound_dashboard/core/notifications/escalation/models.py +336 -0
  50. truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
  51. truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
  52. truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
  53. truthound_dashboard/core/notifications/events.py +49 -0
  54. truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
  55. truthound_dashboard/core/notifications/metrics/base.py +528 -0
  56. truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
  57. truthound_dashboard/core/notifications/routing/__init__.py +169 -0
  58. truthound_dashboard/core/notifications/routing/combinators.py +184 -0
  59. truthound_dashboard/core/notifications/routing/config.py +375 -0
  60. truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
  61. truthound_dashboard/core/notifications/routing/engine.py +382 -0
  62. truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
  63. truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
  64. truthound_dashboard/core/notifications/routing/rules.py +625 -0
  65. truthound_dashboard/core/notifications/routing/validator.py +678 -0
  66. truthound_dashboard/core/notifications/service.py +2 -0
  67. truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
  68. truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
  69. truthound_dashboard/core/notifications/throttling/builder.py +311 -0
  70. truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
  71. truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
  72. truthound_dashboard/core/openlineage.py +1028 -0
  73. truthound_dashboard/core/plugins/__init__.py +39 -0
  74. truthound_dashboard/core/plugins/docs/__init__.py +39 -0
  75. truthound_dashboard/core/plugins/docs/extractor.py +703 -0
  76. truthound_dashboard/core/plugins/docs/renderers.py +804 -0
  77. truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
  78. truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
  79. truthound_dashboard/core/plugins/hooks/manager.py +403 -0
  80. truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
  81. truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
  82. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
  83. truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
  84. truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
  85. truthound_dashboard/core/plugins/loader.py +504 -0
  86. truthound_dashboard/core/plugins/registry.py +810 -0
  87. truthound_dashboard/core/plugins/reporter_executor.py +588 -0
  88. truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
  89. truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
  90. truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
  91. truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
  92. truthound_dashboard/core/plugins/sandbox.py +617 -0
  93. truthound_dashboard/core/plugins/security/__init__.py +68 -0
  94. truthound_dashboard/core/plugins/security/analyzer.py +535 -0
  95. truthound_dashboard/core/plugins/security/policies.py +311 -0
  96. truthound_dashboard/core/plugins/security/protocols.py +296 -0
  97. truthound_dashboard/core/plugins/security/signing.py +842 -0
  98. truthound_dashboard/core/plugins/security.py +446 -0
  99. truthound_dashboard/core/plugins/validator_executor.py +401 -0
  100. truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
  101. truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
  102. truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
  103. truthound_dashboard/core/plugins/versioning/semver.py +266 -0
  104. truthound_dashboard/core/profile_comparison.py +601 -0
  105. truthound_dashboard/core/report_history.py +570 -0
  106. truthound_dashboard/core/reporters/__init__.py +57 -0
  107. truthound_dashboard/core/reporters/base.py +296 -0
  108. truthound_dashboard/core/reporters/csv_reporter.py +155 -0
  109. truthound_dashboard/core/reporters/html_reporter.py +598 -0
  110. truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
  111. truthound_dashboard/core/reporters/i18n/base.py +494 -0
  112. truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
  113. truthound_dashboard/core/reporters/json_reporter.py +160 -0
  114. truthound_dashboard/core/reporters/junit_reporter.py +233 -0
  115. truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
  116. truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
  117. truthound_dashboard/core/reporters/registry.py +272 -0
  118. truthound_dashboard/core/rule_generator.py +2088 -0
  119. truthound_dashboard/core/scheduler.py +822 -12
  120. truthound_dashboard/core/schema_evolution.py +858 -0
  121. truthound_dashboard/core/services.py +152 -9
  122. truthound_dashboard/core/statistics.py +718 -0
  123. truthound_dashboard/core/streaming_anomaly.py +883 -0
  124. truthound_dashboard/core/triggers/__init__.py +45 -0
  125. truthound_dashboard/core/triggers/base.py +226 -0
  126. truthound_dashboard/core/triggers/evaluators.py +609 -0
  127. truthound_dashboard/core/triggers/factory.py +363 -0
  128. truthound_dashboard/core/unified_alerts.py +870 -0
  129. truthound_dashboard/core/validation_limits.py +509 -0
  130. truthound_dashboard/core/versioning.py +709 -0
  131. truthound_dashboard/core/websocket/__init__.py +59 -0
  132. truthound_dashboard/core/websocket/manager.py +512 -0
  133. truthound_dashboard/core/websocket/messages.py +130 -0
  134. truthound_dashboard/db/__init__.py +30 -0
  135. truthound_dashboard/db/models.py +3375 -3
  136. truthound_dashboard/main.py +22 -0
  137. truthound_dashboard/schemas/__init__.py +396 -1
  138. truthound_dashboard/schemas/anomaly.py +1258 -0
  139. truthound_dashboard/schemas/base.py +4 -0
  140. truthound_dashboard/schemas/cross_alerts.py +334 -0
  141. truthound_dashboard/schemas/drift_monitor.py +890 -0
  142. truthound_dashboard/schemas/lineage.py +428 -0
  143. truthound_dashboard/schemas/maintenance.py +154 -0
  144. truthound_dashboard/schemas/model_monitoring.py +374 -0
  145. truthound_dashboard/schemas/notifications_advanced.py +1363 -0
  146. truthound_dashboard/schemas/openlineage.py +704 -0
  147. truthound_dashboard/schemas/plugins.py +1293 -0
  148. truthound_dashboard/schemas/profile.py +420 -34
  149. truthound_dashboard/schemas/profile_comparison.py +242 -0
  150. truthound_dashboard/schemas/reports.py +285 -0
  151. truthound_dashboard/schemas/rule_suggestion.py +434 -0
  152. truthound_dashboard/schemas/schema_evolution.py +164 -0
  153. truthound_dashboard/schemas/source.py +117 -2
  154. truthound_dashboard/schemas/triggers.py +511 -0
  155. truthound_dashboard/schemas/unified_alerts.py +223 -0
  156. truthound_dashboard/schemas/validation.py +25 -1
  157. truthound_dashboard/schemas/validators/__init__.py +11 -0
  158. truthound_dashboard/schemas/validators/base.py +151 -0
  159. truthound_dashboard/schemas/versioning.py +152 -0
  160. truthound_dashboard/static/index.html +2 -2
  161. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -18
  162. truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
  163. truthound_dashboard/static/assets/index-BCA8H1hO.js +0 -574
  164. truthound_dashboard/static/assets/index-BNsSQ2fN.css +0 -1
  165. truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +0 -1
  166. truthound_dashboard-1.3.0.dist-info/RECORD +0 -110
  167. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
  168. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
  169. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,669 @@
1
+ """Drift sampling strategies for large-scale datasets.
2
+
3
+ This module provides various sampling strategies optimized for drift detection
4
+ on 100M+ row datasets. It includes random, stratified, and reservoir sampling
5
+ with automatic sample size estimation based on confidence levels.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import math
12
+ import random
13
+ from abc import ABC, abstractmethod
14
+ from dataclasses import dataclass
15
+ from enum import Enum
16
+ from typing import Any, Iterator, Sequence, TypeVar
17
+
18
+ import numpy as np
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ T = TypeVar("T")
23
+
24
+
25
class SamplingMethod(str, Enum):
    """Available sampling methods for large datasets.

    Inherits from ``str`` so enum values can be used directly wherever a
    plain string is expected (e.g. API payloads, config files).
    """

    RANDOM = "random"  # uniform random sampling without replacement
    STRATIFIED = "stratified"  # proportional sampling per category/stratum
    RESERVOIR = "reservoir"  # single-pass sampling for streams (Algorithm R)
    SYSTEMATIC = "systematic"  # every k-th row with a random starting offset
32
+
33
+
34
@dataclass
class SampleSizeEstimate:
    """Estimated sample size for drift detection.

    Produced by :func:`estimate_sample_size`; all size fields are row counts.

    Attributes:
        recommended_size: Recommended sample size for target confidence.
        min_size: Minimum sample size for basic detection.
        max_size: Maximum useful sample size (diminishing returns beyond).
        confidence_level: Target confidence level (0.0-1.0).
        margin_of_error: Expected margin of error at recommended size.
        estimated_time_seconds: Estimated processing time in seconds.
        memory_mb: Estimated memory usage in MB.
    """

    recommended_size: int
    min_size: int
    max_size: int
    confidence_level: float
    margin_of_error: float
    estimated_time_seconds: float
    memory_mb: float
55
+
56
+
57
@dataclass
class ChunkedComparisonProgress:
    """Progress snapshot for chunked comparison operations.

    Built by :meth:`ChunkedComparisonTracker.get_progress`.

    Attributes:
        total_chunks: Total number of chunks to process.
        processed_chunks: Number of chunks already processed.
        total_rows: Total rows across all chunks.
        processed_rows: Rows processed so far.
        current_chunk: Current chunk being processed.
        elapsed_seconds: Time elapsed since start.
        estimated_remaining_seconds: Estimated time remaining (extrapolated
            from the average chunk duration observed so far).
        columns_with_drift: Columns detected with drift so far.
        early_stop_triggered: Whether early stopping was triggered.
        status: Current status (running, completed, cancelled, error).
    """

    total_chunks: int
    processed_chunks: int
    total_rows: int
    processed_rows: int
    current_chunk: int
    elapsed_seconds: float
    estimated_remaining_seconds: float
    columns_with_drift: list[str]
    early_stop_triggered: bool
    status: str  # running, completed, cancelled, error
84
+
85
+
86
class BaseSampler(ABC):
    """Abstract base class for sampling strategies.

    Concrete strategies implement both row-level sampling (``sample``) and
    index-only sampling (``sample_indices``) so callers can either sample
    in-memory data or fetch only the selected rows from storage.
    """

    @abstractmethod
    def sample(self, data: Sequence[T], sample_size: int) -> list[T]:
        """Sample data from the input sequence.

        Args:
            data: Input data sequence.
            sample_size: Number of samples to extract.

        Returns:
            List of sampled items.
        """
        pass

    @abstractmethod
    def sample_indices(self, total_size: int, sample_size: int) -> list[int]:
        """Generate sample indices for a dataset of given size.

        Args:
            total_size: Total number of rows in dataset.
            sample_size: Number of samples to extract.

        Returns:
            List of indices to sample.
        """
        pass
114
+
115
+
116
class RandomSampler(BaseSampler):
    """Uniform random sampling without replacement.

    The general-purpose default when no stratification or streaming
    constraints apply.  Index generation is O(k); materializing sampled
    rows is O(n).
    """

    def __init__(self, seed: int | None = None) -> None:
        """Create the sampler.

        Args:
            seed: Optional seed so repeated runs draw the same sample.
        """
        self.rng = random.Random(seed)

    def sample(self, data: Sequence[T], sample_size: int) -> list[T]:
        """Return ``sample_size`` randomly chosen items from ``data``."""
        n = len(data)
        if sample_size >= n:
            # Nothing to thin out — return a copy of everything.
            return list(data)
        chosen = self.sample_indices(n, sample_size)
        return [data[pos] for pos in chosen]

    def sample_indices(self, total_size: int, sample_size: int) -> list[int]:
        """Return ``sample_size`` distinct positions in ``[0, total_size)``."""
        if sample_size >= total_size:
            return list(range(total_size))
        return self.rng.sample(range(total_size), sample_size)
143
+
144
+
145
class StratifiedSampler(BaseSampler):
    """Proportional stratified sampling keyed on one column.

    Guarantees that every stratum (category) contributes at least one row,
    which keeps rare categories visible during drift detection.  Runs in a
    single O(n) pass over the data.
    """

    def __init__(
        self,
        strata_column: str | int,
        seed: int | None = None,
    ) -> None:
        """Create the sampler.

        Args:
            strata_column: Column name (dict rows) or index (sequence rows)
                used to group rows into strata.
            seed: Optional seed for reproducible draws.
        """
        self.strata_column = strata_column
        self.rng = random.Random(seed)

    def sample(self, data: Sequence[dict[str, Any]], sample_size: int) -> list[dict[str, Any]]:
        """Draw a proportional stratified sample of ``sample_size`` rows.

        Args:
            data: Rows as dicts (or indexable sequences).
            sample_size: Total number of rows to return.

        Returns:
            Rows sampled so each stratum keeps roughly its original share,
            topped up with random rows if rounding left a shortfall.
        """
        if sample_size >= len(data):
            return list(data)

        # Bucket row positions by stratum key (first-seen order preserved).
        strata: dict[Any, list[int]] = {}
        for pos, row in enumerate(data):
            if isinstance(row, dict):
                key = row.get(self.strata_column)
            else:
                key = row[self.strata_column]
            strata.setdefault(key, []).append(pos)

        total = len(data)
        result_indices: list[int] = []

        # Draw from each stratum proportionally, but never zero rows.
        for positions in strata.values():
            share = len(positions) / total
            take = min(max(1, int(sample_size * share)), len(positions))
            result_indices.extend(self.rng.sample(positions, take))

        # Rounding down can leave us short of the target; top up randomly
        # from the rows no stratum draw has touched yet.
        shortfall = sample_size - len(result_indices)
        if shortfall > 0:
            remaining = set(range(total)) - set(result_indices)
            result_indices.extend(
                self.rng.sample(list(remaining), min(shortfall, len(remaining)))
            )

        return [data[pos] for pos in result_indices[:sample_size]]

    def sample_indices(self, total_size: int, sample_size: int) -> list[int]:
        """Plain random indices (no strata info is available at this level)."""
        if sample_size >= total_size:
            return list(range(total_size))
        return self.rng.sample(range(total_size), sample_size)
213
+
214
+
215
class ReservoirSampler(BaseSampler):
    """Single-pass reservoir sampling (Algorithm R).

    Keeps a fixed-size buffer while scanning the input exactly once, which
    makes it the right choice for streams or datasets too large to index.
    O(n) time, O(k) memory where k is the sample size.
    """

    def __init__(self, seed: int | None = None) -> None:
        """Create the sampler.

        Args:
            seed: Optional seed for reproducible draws.
        """
        self.rng = random.Random(seed)

    def sample(self, data: Sequence[T], sample_size: int) -> list[T]:
        """Materialize a reservoir sample of ``data`` as a list."""
        return list(self.sample_stream(iter(data), sample_size))

    def sample_stream(self, stream: Iterator[T], sample_size: int) -> Iterator[T]:
        """Run Algorithm R over a single-use stream.

        Args:
            stream: Input data stream.
            sample_size: Size of the reservoir to maintain.

        Yields:
            The reservoir contents once the stream is exhausted.
        """
        reservoir: list[T] = []
        for seen, item in enumerate(stream):
            if seen < sample_size:
                # Fill phase: keep everything until the reservoir is full.
                reservoir.append(item)
                continue
            # Replacement phase: each later item displaces a random slot
            # with decreasing probability.
            slot = self.rng.randint(0, seen)
            if slot < sample_size:
                reservoir[slot] = item
        yield from reservoir

    def sample_indices(self, total_size: int, sample_size: int) -> list[int]:
        """Reservoir-sample the index range ``[0, total_size)``, sorted."""
        if sample_size >= total_size:
            return list(range(total_size))

        reservoir = list(range(sample_size))
        for candidate in range(sample_size, total_size):
            slot = self.rng.randint(0, candidate)
            if slot < sample_size:
                reservoir[slot] = candidate
        return sorted(reservoir)
270
+
271
+
272
class SystematicSampler(BaseSampler):
    """Systematic (every k-th row) sampling with a random starting offset.

    Produces samples spread evenly across the dataset's original order.
    Runs in O(k) — only the chosen positions are ever computed.
    """

    def __init__(self, seed: int | None = None) -> None:
        """Create the sampler.

        Args:
            seed: Optional seed for a reproducible starting offset.
        """
        self.rng = random.Random(seed)

    def sample(self, data: Sequence[T], sample_size: int) -> list[T]:
        """Return rows of ``data`` at systematically spaced positions."""
        return [data[pos] for pos in self.sample_indices(len(data), sample_size)]

    def sample_indices(self, total_size: int, sample_size: int) -> list[int]:
        """Return evenly spaced positions with a random phase.

        Args:
            total_size: Number of rows available.
            sample_size: Number of positions requested.

        Returns:
            At most ``sample_size`` valid positions; a position that rounds
            past the end of the data is skipped.
        """
        if sample_size >= total_size:
            return list(range(total_size))

        step = total_size / sample_size
        offset = self.rng.uniform(0, step)

        picks = (int(offset + k * step) for k in range(sample_size))
        return [pos for pos in picks if pos < total_size]
307
+
308
+
309
def get_sampler(method: SamplingMethod, **kwargs) -> BaseSampler:
    """Build a sampler instance for the requested method.

    Args:
        method: Which :class:`SamplingMethod` to instantiate.
        **kwargs: Sampler options; ``seed`` is honored by every method,
            ``strata_column`` (default 0) only by stratified sampling.

    Returns:
        A ready-to-use sampler.

    Raises:
        ValueError: If ``method`` is not a known sampling method.
    """
    seed = kwargs.get("seed")

    # Stratified is the only sampler with an extra constructor argument.
    if method == SamplingMethod.STRATIFIED:
        return StratifiedSampler(
            strata_column=kwargs.get("strata_column", 0), seed=seed
        )

    seed_only: dict[SamplingMethod, type[BaseSampler]] = {
        SamplingMethod.RANDOM: RandomSampler,
        SamplingMethod.RESERVOIR: ReservoirSampler,
        SamplingMethod.SYSTEMATIC: SystematicSampler,
    }
    try:
        factory = seed_only[method]
    except KeyError:
        raise ValueError(f"Unknown sampling method: {method}") from None
    return factory(seed=seed)
332
+
333
+
334
def estimate_sample_size(
    population_size: int,
    confidence_level: float = 0.95,
    margin_of_error: float = 0.03,
    expected_drift_rate: float = 0.1,
    num_columns: int = 10,
) -> SampleSizeEstimate:
    """Estimate optimal sample size for drift detection.

    Uses Cochran's formula with a finite-population correction, adjusted
    for drift detection requirements.

    Args:
        population_size: Total number of rows in the dataset.  Values <= 0
            are treated as "unknown population" (no correction, no cap).
        confidence_level: Target confidence level (default 0.95).
        margin_of_error: Acceptable margin of error (default 0.03 = 3%).
        expected_drift_rate: Expected proportion of drifted values (default 0.1).
        num_columns: Number of columns to analyze (affects time/memory estimates).

    Returns:
        SampleSizeEstimate with recommended sizes and rough time/memory
        estimates, always satisfying min_size <= recommended_size <= max_size.
    """
    # Z-scores for common confidence levels; anything else falls back to 95%.
    z_scores = {
        0.90: 1.645,
        0.95: 1.96,
        0.99: 2.576,
    }
    z = z_scores.get(confidence_level, 1.96)

    # Cochran's formula: n0 = z^2 * p * (1 - p) / e^2
    p = expected_drift_rate
    q = 1 - p
    n0 = (z ** 2 * p * q) / (margin_of_error ** 2)

    # Finite population correction (skipped for unknown populations).
    if population_size > 0:
        n = n0 / (1 + (n0 - 1) / population_size)
    else:
        n = n0

    recommended = int(math.ceil(n))

    # Minimum sample size for statistical validity.
    min_size = max(100, int(recommended * 0.3))

    # Maximum useful sample size (diminishing returns beyond ~3x).
    max_size = int(recommended * 3)

    if population_size > 0:
        # A sample can never exceed the population itself.  Without this
        # clamp, tiny populations yielded recommended/min sizes larger than
        # the dataset, and population_size == 0 produced max_size == 0 with
        # max_size < min_size.
        min_size = min(min_size, population_size)
        max_size = min(max_size, population_size)

    # Ensure min_size <= recommended <= max_size.
    recommended = max(min_size, min(recommended, max_size))

    # Estimate processing time (rough heuristic: ~10,000 rows/sec per column).
    rows_per_second = 10000
    estimated_time = (recommended * num_columns) / rows_per_second

    # Estimate memory usage (~100 bytes per row per column for numeric data).
    bytes_per_row = 100 * num_columns
    memory_mb = (recommended * bytes_per_row) / (1024 * 1024)

    return SampleSizeEstimate(
        recommended_size=recommended,
        min_size=min_size,
        max_size=max_size,
        confidence_level=confidence_level,
        margin_of_error=margin_of_error,
        estimated_time_seconds=round(estimated_time, 2),
        memory_mb=round(memory_mb, 2),
    )
403
+
404
+
405
def calculate_chunk_size(
    total_rows: int,
    available_memory_mb: float = 1024,
    bytes_per_row: int = 1000,
    target_chunks: int | None = None,
) -> int:
    """Pick a chunk size (in rows) that fits comfortably in memory.

    Args:
        total_rows: Total number of rows that will be processed.
        available_memory_mb: Memory budget in megabytes.
        bytes_per_row: Estimated in-memory footprint of one row.
        target_chunks: If given, aim for this many chunks (still capped by
            the memory budget).

    Returns:
        Chunk size in rows, clamped to the range [10_000, 10_000_000].
    """
    # Rows that would fit in the whole budget, then keep only 80% of that
    # as safety headroom for processing overhead.
    budget_rows = int((available_memory_mb * 1024 * 1024) / bytes_per_row)
    memory_cap = int(budget_rows * 0.8)

    chunk_size = memory_cap
    if target_chunks:
        # Honor the requested chunk count unless memory says otherwise.
        chunk_size = min(total_rows // target_chunks, memory_cap)

    # Keep chunks within sane operational bounds.
    lower, upper = 10000, 10_000_000
    return max(lower, min(chunk_size, upper))
441
+
442
+
443
def should_early_stop(
    columns_with_drift: list[str],
    total_columns: int,
    threshold: float = 0.5,
    min_processed: int = 3,
) -> bool:
    """Determine if early stopping should be triggered.

    Early stopping is useful when drift is already so widespread that
    processing more data won't change the conclusion.

    Args:
        columns_with_drift: List of columns where drift was detected.
        total_columns: Total number of columns being analyzed.
        threshold: Proportion of drifted columns to trigger early stop.
        min_processed: Minimum number of *drifted* columns required before
            early stopping is even considered (guards against deciding on
            too little evidence).

    Returns:
        True if early stopping should be triggered.
    """
    # Guard: with no columns there is no meaningful drift rate (the
    # division below would raise ZeroDivisionError).
    if total_columns <= 0:
        return False

    if len(columns_with_drift) < min_processed:
        return False

    drift_rate = len(columns_with_drift) / total_columns
    return drift_rate >= threshold
468
+
469
+
470
class ChunkedComparisonTracker:
    """Progress bookkeeping for chunked drift-comparison jobs.

    Records per-chunk timings so the remaining time can be extrapolated
    from the average chunk duration observed so far.
    """

    def __init__(
        self,
        total_rows: int,
        chunk_size: int,
        total_columns: int,
    ) -> None:
        """Set up the tracker.

        Args:
            total_rows: Total rows that will be processed.
            chunk_size: Rows per chunk.
            total_columns: Number of columns being compared.
        """
        self.total_rows = total_rows
        self.chunk_size = chunk_size
        self.total_columns = total_columns
        self.total_chunks = math.ceil(total_rows / chunk_size)

        self.processed_chunks = 0
        self.processed_rows = 0
        self.current_chunk = 0
        self.columns_with_drift: list[str] = []
        self.early_stop_triggered = False
        self.status = "running"

        # Wall-clock start (set by start()) and per-chunk durations.
        self._start_time: float | None = None
        self._chunk_times: list[float] = []

    def start(self) -> None:
        """Record the processing start time and reset status to running."""
        import time

        self._start_time = time.time()
        self.status = "running"

    def update_chunk(
        self,
        chunk_index: int,
        rows_in_chunk: int,
        drifted_columns: list[str],
        chunk_time: float,
    ) -> None:
        """Fold one finished chunk into the running totals.

        Args:
            chunk_index: Zero-based index of the chunk just completed.
            rows_in_chunk: Row count of that chunk.
            drifted_columns: Columns flagged as drifted within the chunk.
            chunk_time: Wall-clock seconds spent on the chunk.
        """
        done = chunk_index + 1
        self.current_chunk = done
        self.processed_chunks = done
        self.processed_rows += rows_in_chunk
        self._chunk_times.append(chunk_time)

        # Accumulate newly drifted columns, keeping first-seen order.
        self.columns_with_drift.extend(
            col for col in drifted_columns if col not in self.columns_with_drift
        )

    def trigger_early_stop(self) -> None:
        """Flag early stop and mark the job completed."""
        self.early_stop_triggered = True
        self.status = "completed"

    def complete(self) -> None:
        """Mark the job as completed normally."""
        self.status = "completed"

    def cancel(self) -> None:
        """Mark the job as cancelled."""
        self.status = "cancelled"

    def error(self, message: str) -> None:
        """Mark the job as failed and log the reason."""
        self.status = "error"
        logger.error(f"Chunked comparison failed: {message}")

    def get_progress(self) -> ChunkedComparisonProgress:
        """Build a snapshot of the current progress.

        Returns:
            A :class:`ChunkedComparisonProgress` describing the job state,
            with an ETA extrapolated from the mean chunk duration.
        """
        import time

        elapsed = time.time() - self._start_time if self._start_time else 0.0

        # ETA: mean chunk time times chunks still outstanding.
        eta = 0.0
        if self._chunk_times and self.processed_chunks < self.total_chunks:
            mean_chunk = sum(self._chunk_times) / len(self._chunk_times)
            eta = mean_chunk * (self.total_chunks - self.processed_chunks)

        return ChunkedComparisonProgress(
            total_chunks=self.total_chunks,
            processed_chunks=self.processed_chunks,
            total_rows=self.total_rows,
            processed_rows=self.processed_rows,
            current_chunk=self.current_chunk,
            elapsed_seconds=round(elapsed, 2),
            estimated_remaining_seconds=round(eta, 2),
            columns_with_drift=self.columns_with_drift.copy(),
            early_stop_triggered=self.early_stop_triggered,
            status=self.status,
        )
584
+
585
+
586
async def parallel_column_compare(
    baseline_column_data: dict[str, list[Any]],
    current_column_data: dict[str, list[Any]],
    method: str = "auto",
    threshold: float = 0.05,
    max_workers: int = 4,
) -> dict[str, dict[str, Any]]:
    """Compare multiple columns in parallel.

    Columns listed in ``baseline_column_data`` are each compared against the
    same column in ``current_column_data``.  Numeric columns use a z-score
    test on the mean shift; columns that cannot be coerced to float fall
    back to a categorical comparison of value sets.

    Note: the original implementation scheduled ``asyncio.run`` calls on a
    thread pool that were never awaited (duplicated work, nested event
    loops) and then recomputed every column sequentially.  This version
    actually fans the work out across ``max_workers`` threads.

    Args:
        baseline_column_data: Dict mapping column names to baseline values.
        current_column_data: Dict mapping column names to current values.
        method: Drift detection method label echoed into numeric results.
        threshold: Drift threshold (mean shift for zero-variance columns,
            new-category ratio for categorical columns).
        max_workers: Maximum parallel worker threads.

    Returns:
        Dict mapping column names to per-column drift result dicts.
    """
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    def _compare_column(column: str) -> tuple[str, dict[str, Any]]:
        """Compare one column; synchronous so it can run on a worker thread."""
        baseline = baseline_column_data.get(column, [])
        current = current_column_data.get(column, [])

        if not baseline or not current:
            return column, {"drifted": False, "error": "No data"}

        try:
            baseline_arr = np.array(baseline, dtype=float)
            current_arr = np.array(current, dtype=float)

            baseline_mean = np.mean(baseline_arr)
            current_mean = np.mean(current_arr)
            baseline_std = np.std(baseline_arr)

            # Z-score test for mean shift; zero-variance baselines fall
            # back to an absolute mean-difference check.
            if baseline_std > 0:
                z_score = abs(current_mean - baseline_mean) / baseline_std
                drifted = z_score > 2.0  # simplified threshold
            else:
                drifted = abs(current_mean - baseline_mean) > threshold

            return column, {
                "drifted": bool(drifted),
                "baseline_mean": float(baseline_mean),
                "current_mean": float(current_mean),
                "baseline_std": float(baseline_std),
                "method": method,
            }
        except (ValueError, TypeError):
            # Non-numeric column - compare category sets instead.
            baseline_set = set(baseline)
            current_set = set(current)
            new_values = current_set - baseline_set
            drifted = len(new_values) > len(baseline_set) * threshold

            return column, {
                "drifted": drifted,
                "new_categories": list(new_values)[:10],
                "method": "categorical",
            }

    columns = list(baseline_column_data.keys())
    if not columns:
        return {}

    # Fan the (CPU-bound numpy) comparisons out over a bounded thread pool
    # and await them all; get_running_loop() replaces the deprecated
    # get_event_loop() inside coroutines.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pairs = await asyncio.gather(
            *(loop.run_in_executor(executor, _compare_column, col) for col in columns)
        )

    return dict(pairs)
@@ -0,0 +1,42 @@
1
+ """Internationalization module for truthound-dashboard.
2
+
3
+ This module provides unified i18n support for:
4
+ - API error messages
5
+ - Validation result messages
6
+ - User-facing content
7
+
8
+ It reuses the SupportedLocale enum from reporters/i18n for consistency.
9
+
10
+ Example:
11
+ from truthound_dashboard.core.i18n import (
12
+ get_message,
13
+ detect_locale,
14
+ SupportedLocale,
15
+ SUPPORTED_LOCALES,
16
+ )
17
+
18
+ # Get error message in detected locale
19
+ locale = detect_locale(request)
20
+ message = get_message("error.source_not_found", locale)
21
+ """
22
+
23
+ from truthound_dashboard.core.reporters.i18n.base import SupportedLocale
24
+
25
+ from .detector import detect_locale, parse_accept_language
26
+ from .messages import ERROR_MESSAGES, get_all_messages, get_message
27
+
28
+ # All supported locale codes
29
+ SUPPORTED_LOCALES = [locale.value for locale in SupportedLocale]
30
+
31
+ __all__ = [
32
+ # Core classes
33
+ "SupportedLocale",
34
+ # Functions
35
+ "get_message",
36
+ "get_all_messages",
37
+ "detect_locale",
38
+ "parse_accept_language",
39
+ # Constants
40
+ "SUPPORTED_LOCALES",
41
+ "ERROR_MESSAGES",
42
+ ]