truthound-dashboard 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/__init__.py +11 -0
- truthound_dashboard/__main__.py +6 -0
- truthound_dashboard/api/__init__.py +15 -0
- truthound_dashboard/api/deps.py +153 -0
- truthound_dashboard/api/drift.py +179 -0
- truthound_dashboard/api/error_handlers.py +287 -0
- truthound_dashboard/api/health.py +78 -0
- truthound_dashboard/api/history.py +62 -0
- truthound_dashboard/api/middleware.py +626 -0
- truthound_dashboard/api/notifications.py +561 -0
- truthound_dashboard/api/profile.py +52 -0
- truthound_dashboard/api/router.py +83 -0
- truthound_dashboard/api/rules.py +277 -0
- truthound_dashboard/api/schedules.py +329 -0
- truthound_dashboard/api/schemas.py +136 -0
- truthound_dashboard/api/sources.py +229 -0
- truthound_dashboard/api/validations.py +125 -0
- truthound_dashboard/cli.py +226 -0
- truthound_dashboard/config.py +132 -0
- truthound_dashboard/core/__init__.py +264 -0
- truthound_dashboard/core/base.py +185 -0
- truthound_dashboard/core/cache.py +479 -0
- truthound_dashboard/core/connections.py +331 -0
- truthound_dashboard/core/encryption.py +409 -0
- truthound_dashboard/core/exceptions.py +627 -0
- truthound_dashboard/core/logging.py +488 -0
- truthound_dashboard/core/maintenance.py +542 -0
- truthound_dashboard/core/notifications/__init__.py +56 -0
- truthound_dashboard/core/notifications/base.py +390 -0
- truthound_dashboard/core/notifications/channels.py +557 -0
- truthound_dashboard/core/notifications/dispatcher.py +453 -0
- truthound_dashboard/core/notifications/events.py +155 -0
- truthound_dashboard/core/notifications/service.py +744 -0
- truthound_dashboard/core/sampling.py +626 -0
- truthound_dashboard/core/scheduler.py +311 -0
- truthound_dashboard/core/services.py +1531 -0
- truthound_dashboard/core/truthound_adapter.py +659 -0
- truthound_dashboard/db/__init__.py +67 -0
- truthound_dashboard/db/base.py +108 -0
- truthound_dashboard/db/database.py +196 -0
- truthound_dashboard/db/models.py +732 -0
- truthound_dashboard/db/repository.py +237 -0
- truthound_dashboard/main.py +309 -0
- truthound_dashboard/schemas/__init__.py +150 -0
- truthound_dashboard/schemas/base.py +96 -0
- truthound_dashboard/schemas/drift.py +118 -0
- truthound_dashboard/schemas/history.py +74 -0
- truthound_dashboard/schemas/profile.py +91 -0
- truthound_dashboard/schemas/rule.py +199 -0
- truthound_dashboard/schemas/schedule.py +88 -0
- truthound_dashboard/schemas/schema.py +121 -0
- truthound_dashboard/schemas/source.py +138 -0
- truthound_dashboard/schemas/validation.py +192 -0
- truthound_dashboard/static/assets/index-BqJMyAHX.js +110 -0
- truthound_dashboard/static/assets/index-DMDxHCTs.js +465 -0
- truthound_dashboard/static/assets/index-Dm2D11TK.css +1 -0
- truthound_dashboard/static/index.html +15 -0
- truthound_dashboard/static/mockServiceWorker.js +349 -0
- truthound_dashboard-1.0.0.dist-info/METADATA +218 -0
- truthound_dashboard-1.0.0.dist-info/RECORD +62 -0
- truthound_dashboard-1.0.0.dist-info/WHEEL +4 -0
- truthound_dashboard-1.0.0.dist-info/entry_points.txt +5 -0
|
@@ -0,0 +1,626 @@
|
|
|
1
|
+
"""Data sampling strategies for large dataset handling.
|
|
2
|
+
|
|
3
|
+
This module provides an extensible sampling system for handling large datasets
|
|
4
|
+
before validation. The Strategy pattern allows adding new sampling methods
|
|
5
|
+
without modifying existing code.
|
|
6
|
+
|
|
7
|
+
Supported formats:
|
|
8
|
+
- CSV files
|
|
9
|
+
- Parquet files
|
|
10
|
+
- JSON/JSONL files
|
|
11
|
+
|
|
12
|
+
Features:
|
|
13
|
+
- Automatic format detection
|
|
14
|
+
- Configurable size thresholds
|
|
15
|
+
- Multiple sampling strategies (random, head, stratified)
|
|
16
|
+
- Memory-efficient streaming for very large files
|
|
17
|
+
|
|
18
|
+
Example:
|
|
19
|
+
sampler = get_sampler()
|
|
20
|
+
|
|
21
|
+
# Check if sampling is needed
|
|
22
|
+
if sampler.needs_sampling("/path/to/large.csv"):
|
|
23
|
+
sampled_path = await sampler.sample("/path/to/large.csv", n=10000)
|
|
24
|
+
# Use sampled_path for validation
|
|
25
|
+
|
|
26
|
+
# Or use auto-sample which handles the logic
|
|
27
|
+
data_path = await sampler.auto_sample("/path/to/data.csv")
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import asyncio
|
|
33
|
+
import hashlib
|
|
34
|
+
import logging
|
|
35
|
+
from abc import ABC, abstractmethod
|
|
36
|
+
from dataclasses import dataclass
|
|
37
|
+
from enum import Enum
|
|
38
|
+
from pathlib import Path
|
|
39
|
+
from typing import Any
|
|
40
|
+
|
|
41
|
+
from truthound_dashboard.config import get_settings
|
|
42
|
+
|
|
43
|
+
logger = logging.getLogger(__name__)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class SamplingMethod(str, Enum):
    """Identifiers for the supported sampling methods."""

    RANDOM = "random"  # uniform random sample across the whole dataset
    HEAD = "head"  # first N rows; cheapest option
    TAIL = "tail"  # last N rows
    STRATIFIED = "stratified"  # proportional sample per category column
    RESERVOIR = "reservoir"  # streaming reservoir sampling
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class SamplingConfig:
    """Configuration for data sampling.

    Attributes:
        size_threshold_mb: File size threshold in MB to trigger sampling.
        row_threshold: Row count threshold to trigger sampling.
        default_sample_size: Default number of rows to sample.
        method: Default sampling method.
        seed: Random seed for reproducibility.
        temp_dir: Directory for temporary sampled files.
        cleanup_after_hours: Hours to keep temp files before cleanup.
    """

    size_threshold_mb: float = 100.0
    row_threshold: int = 1_000_000
    default_sample_size: int = 10_000
    method: SamplingMethod = SamplingMethod.RANDOM
    seed: int = 42
    # None means "resolve at runtime": DataSampler.__init__ replaces it with
    # settings.cache_dir / "samples" and creates the directory.
    temp_dir: Path | None = None
    cleanup_after_hours: int = 24
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class SamplingResult:
    """Outcome of a single sampling operation.

    Attributes:
        original_path: Path to original file.
        sampled_path: Path to sampled file (same as original if no sampling).
        was_sampled: Whether sampling was performed.
        original_rows: Number of rows in original file.
        sampled_rows: Number of rows in sampled file.
        method: Sampling method used.
        size_reduction_pct: Percentage reduction in file size.
    """

    original_path: str
    sampled_path: str
    was_sampled: bool
    original_rows: int | None = None
    sampled_rows: int | None = None
    method: SamplingMethod | None = None
    size_reduction_pct: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict representation suitable for JSON output."""
        payload: dict[str, Any] = {
            "original_path": self.original_path,
            "sampled_path": self.sampled_path,
            "was_sampled": self.was_sampled,
            "original_rows": self.original_rows,
            "sampled_rows": self.sampled_rows,
        }
        # Enum -> plain string; None stays None. Percentage rounded for display.
        payload["method"] = self.method.value if self.method else None
        payload["size_reduction_pct"] = round(self.size_reduction_pct, 2)
        return payload
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class SamplingStrategy(ABC):
    """Interface for pluggable sampling algorithms.

    An implementation declares which :class:`SamplingMethod` it provides and
    reduces a DataFrame to at most ``n`` rows. Custom subclasses can be
    plugged in at runtime via ``DataSampler.register_strategy``.
    """

    @property
    @abstractmethod
    def method(self) -> SamplingMethod:
        """Identifier of the sampling method this strategy implements."""
        ...

    @abstractmethod
    def sample(
        self,
        df: Any,
        n: int,
        seed: int = 42,
        **kwargs: Any,
    ) -> Any:
        """Reduce *df* to at most ``n`` rows.

        Args:
            df: DataFrame to sample from.
            n: Number of rows to sample.
            seed: Seed controlling any randomness, for reproducibility.
            **kwargs: Additional strategy-specific arguments.

        Returns:
            Sampled DataFrame.
        """
        ...
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class RandomSamplingStrategy(SamplingStrategy):
    """Uniform random sampling across the whole dataset."""

    @property
    def method(self) -> SamplingMethod:
        return SamplingMethod.RANDOM

    def sample(
        self,
        df: Any,
        n: int,
        seed: int = 42,
        **kwargs: Any,
    ) -> Any:
        """Return ``n`` randomly chosen rows, or *df* unchanged if it is
        already small enough."""
        small_enough = len(df) <= n
        if small_enough:
            return df
        return df.sample(n=n, seed=seed)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class HeadSamplingStrategy(SamplingStrategy):
    """Keep the first N rows — cheapest method, but biased toward file order."""

    @property
    def method(self) -> SamplingMethod:
        return SamplingMethod.HEAD

    def sample(
        self,
        df: Any,
        n: int,
        seed: int = 42,
        **kwargs: Any,
    ) -> Any:
        """Return the leading ``n`` rows of *df*; ``seed`` is ignored."""
        return df.head(n)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class TailSamplingStrategy(SamplingStrategy):
    """Keep the last N rows of the dataset."""

    @property
    def method(self) -> SamplingMethod:
        return SamplingMethod.TAIL

    def sample(
        self,
        df: Any,
        n: int,
        seed: int = 42,
        **kwargs: Any,
    ) -> Any:
        """Return the trailing ``n`` rows of *df*; ``seed`` is ignored."""
        return df.tail(n)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
class StratifiedSamplingStrategy(SamplingStrategy):
    """Proportional sampling across the groups of a categorical column."""

    @property
    def method(self) -> SamplingMethod:
        return SamplingMethod.STRATIFIED

    def sample(
        self,
        df: Any,
        n: int,
        seed: int = 42,
        stratify_column: str | None = None,
        **kwargs: Any,
    ) -> Any:
        """Sample about ``n`` rows while preserving per-group proportions.

        Args:
            df: DataFrame to sample from.
            n: Total number of rows to sample.
            seed: Random seed.
            stratify_column: Column to stratify by; when ``None`` or absent
                from ``df.columns``, falls back to plain random sampling.
            **kwargs: Additional arguments (unused).

        Returns:
            Sampled DataFrame with proportional representation.
        """
        if len(df) <= n:
            # Already at or below the target size — nothing to do.
            return df

        column_usable = stratify_column is not None and stratify_column in df.columns
        if not column_usable:
            # Degrade gracefully to uniform random sampling.
            return df.sample(n=n, seed=seed)

        # Per-group sampling fraction, padded by 10% so that rounding inside
        # small groups does not leave the overall result short of n rows.
        frac = n / len(df)
        oversampled = df.group_by(stratify_column).map_groups(
            lambda group: group.sample(
                fraction=min(1.0, frac * 1.1),
                seed=seed,
            )
        )

        # The padding may have produced more than n rows; trim uniformly.
        if len(oversampled) > n:
            oversampled = oversampled.sample(n=n, seed=seed)

        return oversampled
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
class DataSampler:
    """High-level coordinator for sampling large data files.

    Detects the file format, decides whether sampling is needed from the
    configured thresholds, delegates row selection to a registered
    :class:`SamplingStrategy`, and writes the sampled data to a temp
    directory (text formats are re-written as parquet for compactness).

    Usage:
        sampler = DataSampler()
        result = await sampler.auto_sample("/path/to/large.csv")
        # Use result.sampled_path for validation
    """

    def __init__(self, config: SamplingConfig | None = None) -> None:
        """Initialize data sampler.

        Args:
            config: Sampling configuration. Uses defaults if not provided.
        """
        self._config = config or SamplingConfig()
        self._strategies: dict[SamplingMethod, SamplingStrategy] = {}
        self._register_default_strategies()

        # Resolve the temp directory from app settings when not supplied,
        # and make sure it exists up front.
        if self._config.temp_dir is None:
            settings = get_settings()
            self._config.temp_dir = settings.cache_dir / "samples"
        self._config.temp_dir.mkdir(parents=True, exist_ok=True)

    def _register_default_strategies(self) -> None:
        """Register all built-in sampling strategies.

        Note: no strategy is registered for ``SamplingMethod.RESERVOIR``,
        so requesting it raises ``ValueError`` in :meth:`sample`.
        """
        self._strategies = {
            SamplingMethod.RANDOM: RandomSamplingStrategy(),
            SamplingMethod.HEAD: HeadSamplingStrategy(),
            SamplingMethod.TAIL: TailSamplingStrategy(),
            SamplingMethod.STRATIFIED: StratifiedSamplingStrategy(),
        }

    def register_strategy(self, strategy: SamplingStrategy) -> None:
        """Register a custom sampling strategy.

        Replaces any previously registered strategy for the same method.

        Args:
            strategy: Sampling strategy to register.
        """
        self._strategies[strategy.method] = strategy

    @property
    def config(self) -> SamplingConfig:
        """Get sampling configuration."""
        return self._config

    def get_file_info(self, path: str | Path) -> dict[str, Any]:
        """Get file information for sampling decision.

        Args:
            path: Path to data file.

        Returns:
            Dictionary with keys ``path``, ``size_bytes``, ``size_mb``,
            ``format`` and ``estimated_rows`` (``None`` unless CSV).

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        path = Path(path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")

        size_bytes = path.stat().st_size
        size_mb = size_bytes / (1024 * 1024)

        # Detect logical format from the extension; unrecognized extensions
        # are reported as "unknown" and later treated as CSV when loading.
        suffix = path.suffix.lower()
        format_map = {
            ".csv": "csv",
            ".parquet": "parquet",
            ".pq": "parquet",
            ".json": "json",
            ".jsonl": "jsonl",
            ".ndjson": "jsonl",
        }
        file_format = format_map.get(suffix, "unknown")

        # Rough CSV row estimate: average line length over the first 10KB,
        # extrapolated to the full file size.
        estimated_rows = None
        if file_format == "csv" and size_mb > 0:
            with open(path, encoding="utf-8", errors="ignore") as f:
                sample = f.read(10240)
            lines = sample.count("\n")
            if lines > 0:
                avg_line_size = len(sample) / lines
                estimated_rows = int(size_bytes / avg_line_size)

        return {
            "path": str(path),
            "size_bytes": size_bytes,
            "size_mb": round(size_mb, 2),
            "format": file_format,
            "estimated_rows": estimated_rows,
        }

    def needs_sampling(self, path: str | Path) -> bool:
        """Check if a file needs sampling based on configured thresholds.

        A file qualifies when its size exceeds ``size_threshold_mb`` or —
        where a row estimate is available (CSV only) — the estimate exceeds
        ``row_threshold``. The row check makes the previously unused
        ``SamplingConfig.row_threshold`` setting effective.

        Args:
            path: Path to data file.

        Returns:
            True if the file exceeds a configured threshold.
        """
        info = self.get_file_info(path)
        if info["size_mb"] > self._config.size_threshold_mb:
            return True
        estimated_rows = info["estimated_rows"]
        return estimated_rows is not None and estimated_rows > self._config.row_threshold

    def _load_dataframe(self, path: str | Path) -> Any:
        """Load a data file into a polars DataFrame based on its extension.

        Args:
            path: Path to data file.

        Returns:
            Polars DataFrame.
        """
        import polars as pl

        path = Path(path)
        suffix = path.suffix.lower()

        if suffix == ".csv":
            return pl.read_csv(path, infer_schema_length=10000)
        elif suffix in (".parquet", ".pq"):
            return pl.read_parquet(path)
        elif suffix == ".json":
            return pl.read_json(path)
        elif suffix in (".jsonl", ".ndjson"):
            return pl.read_ndjson(path)
        else:
            # Unknown extension: CSV is the most forgiving reader available.
            logger.warning(f"Unknown format {suffix}, trying CSV")
            return pl.read_csv(path, infer_schema_length=10000)

    def _save_dataframe(self, df: Any, path: Path, original_format: str) -> None:
        """Save a DataFrame in the format implied by the *output* path.

        Bug fix: the writer is selected from ``path``'s extension instead of
        ``original_format``. Previously a CSV input was written with
        ``write_csv`` into the ``.parquet`` file name produced by
        ``_generate_sample_path``, yielding a mislabeled sample file.

        Args:
            df: Polars DataFrame to save.
            path: Output path; its suffix selects the writer.
            original_format: Original file format (kept for interface
                compatibility; no longer consulted).
        """
        suffix = path.suffix.lower()
        if suffix in (".parquet", ".pq"):
            df.write_parquet(path)
        elif suffix == ".json":
            df.write_json(path)
        elif suffix in (".jsonl", ".ndjson"):
            df.write_ndjson(path)
        else:
            # Default to CSV for .csv and unknown extensions.
            df.write_csv(path)

    def _generate_sample_path(self, original_path: Path) -> Path:
        """Generate a unique path for the sampled file.

        Args:
            original_path: Path to original file.

        Returns:
            Path for sampled file in temp directory.
        """
        # md5 is used purely for a short, stable filename — not for security.
        path_hash = hashlib.md5(str(original_path).encode()).hexdigest()[:12]
        suffix = original_path.suffix

        # Store samples of text formats as parquet: smaller and faster to load.
        if suffix in (".csv", ".json", ".jsonl", ".ndjson"):
            suffix = ".parquet"

        return self._config.temp_dir / f"sample_{path_hash}{suffix}"

    async def sample(
        self,
        path: str | Path,
        n: int | None = None,
        method: SamplingMethod | None = None,
        **kwargs: Any,
    ) -> SamplingResult:
        """Sample data from a file.

        Args:
            path: Path to data file.
            n: Number of rows to sample. Uses config default if not provided.
            method: Sampling method. Uses config default if not provided.
            **kwargs: Additional arguments for specific strategies.

        Returns:
            SamplingResult with paths and statistics.

        Raises:
            ValueError: If no strategy is registered for ``method``.
        """
        path = Path(path)
        # Explicit None checks (rather than `or`) so a falsy value such as
        # n=0 is not silently replaced by the configured default.
        if n is None:
            n = self._config.default_sample_size
        if method is None:
            method = self._config.method

        strategy = self._strategies.get(method)
        if strategy is None:
            raise ValueError(f"Unknown sampling method: {method}")

        # The load/sample/save pipeline is blocking; run it in a worker
        # thread. get_running_loop() replaces the deprecated
        # get_event_loop() pattern inside coroutines.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            self._sample_sync,
            path,
            n,
            strategy,
            kwargs,
        )

        return result

    def _sample_sync(
        self,
        path: Path,
        n: int,
        strategy: SamplingStrategy,
        kwargs: dict[str, Any],
    ) -> SamplingResult:
        """Blocking sampling implementation (runs in an executor thread).

        Args:
            path: Path to data file.
            n: Number of rows to sample.
            strategy: Sampling strategy to use.
            kwargs: Additional strategy arguments.

        Returns:
            SamplingResult with sampling details.
        """
        file_info = self.get_file_info(path)

        # NOTE: the whole file is loaded into memory before sampling.
        logger.info(f"Loading {path} for sampling ({file_info['size_mb']:.1f} MB)")
        df = self._load_dataframe(path)
        original_rows = len(df)

        # Already at or below the target size: hand back the original file.
        if original_rows <= n:
            logger.info(f"File has {original_rows} rows, no sampling needed")
            return SamplingResult(
                original_path=str(path),
                sampled_path=str(path),
                was_sampled=False,
                original_rows=original_rows,
                sampled_rows=original_rows,
            )

        logger.info(
            f"Sampling {n} rows from {original_rows} using {strategy.method.value}"
        )
        sampled_df = strategy.sample(
            df,
            n=n,
            seed=self._config.seed,
            **kwargs,
        )
        sampled_rows = len(sampled_df)

        # Persist the sample; the output path's suffix decides the writer.
        sample_path = self._generate_sample_path(path)
        self._save_dataframe(sampled_df, sample_path, file_info["format"])

        sampled_size = sample_path.stat().st_size
        size_reduction = (1 - sampled_size / file_info["size_bytes"]) * 100

        logger.info(
            f"Sampling complete: {original_rows} -> {sampled_rows} rows "
            f"({size_reduction:.1f}% size reduction)"
        )

        return SamplingResult(
            original_path=str(path),
            sampled_path=str(sample_path),
            was_sampled=True,
            original_rows=original_rows,
            sampled_rows=sampled_rows,
            method=strategy.method,
            size_reduction_pct=size_reduction,
        )

    async def auto_sample(
        self,
        path: str | Path,
        n: int | None = None,
        method: SamplingMethod | None = None,
        **kwargs: Any,
    ) -> SamplingResult:
        """Automatically sample if needed based on configured thresholds.

        This is the recommended entry point for most use cases. It checks
        the file against the thresholds and only samples when exceeded.

        Args:
            path: Path to data file.
            n: Number of rows to sample if needed.
            method: Sampling method if sampling is needed.
            **kwargs: Additional strategy arguments.

        Returns:
            SamplingResult (was_sampled=False if no sampling needed).
        """
        path = Path(path)

        if not self.needs_sampling(path):
            # Below all thresholds — return the original path untouched.
            return SamplingResult(
                original_path=str(path),
                sampled_path=str(path),
                was_sampled=False,
            )

        return await self.sample(path, n=n, method=method, **kwargs)

    async def cleanup_old_samples(self, max_age_hours: int | None = None) -> int:
        """Clean up old sample files from the temp directory.

        Args:
            max_age_hours: Maximum age in hours. Uses config default if not
                provided; an explicit 0 deletes all samples.

        Returns:
            Number of files cleaned up.
        """
        import time

        # Explicit None check so max_age_hours=0 is honored rather than
        # silently replaced by the configured default.
        if max_age_hours is None:
            max_age_hours = self._config.cleanup_after_hours
        max_age_seconds = max_age_hours * 3600
        now = time.time()

        cleaned = 0
        for sample_file in self._config.temp_dir.glob("sample_*"):
            if sample_file.is_file():
                age = now - sample_file.stat().st_mtime
                if age > max_age_seconds:
                    sample_file.unlink()
                    cleaned += 1

        if cleaned > 0:
            logger.info(f"Cleaned up {cleaned} old sample files")

        return cleaned
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
# Process-wide singleton, created lazily by get_sampler() and cleared by
# reset_sampler() (used in tests).
_sampler: DataSampler | None = None
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
def get_sampler() -> DataSampler:
    """Return the process-wide :class:`DataSampler`, creating it lazily.

    Returns:
        The shared DataSampler instance.
    """
    global _sampler
    if _sampler is None:
        # First call: build the singleton with default configuration.
        _sampler = DataSampler()
    return _sampler
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def reset_sampler() -> None:
    """Discard the sampler singleton so the next get_sampler() call
    constructs a fresh instance (intended for tests)."""
    global _sampler
    _sampler = None
|