truthound-dashboard 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/__init__.py +11 -0
- truthound_dashboard/__main__.py +6 -0
- truthound_dashboard/api/__init__.py +15 -0
- truthound_dashboard/api/deps.py +153 -0
- truthound_dashboard/api/drift.py +179 -0
- truthound_dashboard/api/error_handlers.py +287 -0
- truthound_dashboard/api/health.py +78 -0
- truthound_dashboard/api/history.py +62 -0
- truthound_dashboard/api/middleware.py +626 -0
- truthound_dashboard/api/notifications.py +561 -0
- truthound_dashboard/api/profile.py +52 -0
- truthound_dashboard/api/router.py +83 -0
- truthound_dashboard/api/rules.py +277 -0
- truthound_dashboard/api/schedules.py +329 -0
- truthound_dashboard/api/schemas.py +136 -0
- truthound_dashboard/api/sources.py +229 -0
- truthound_dashboard/api/validations.py +125 -0
- truthound_dashboard/cli.py +226 -0
- truthound_dashboard/config.py +132 -0
- truthound_dashboard/core/__init__.py +264 -0
- truthound_dashboard/core/base.py +185 -0
- truthound_dashboard/core/cache.py +479 -0
- truthound_dashboard/core/connections.py +331 -0
- truthound_dashboard/core/encryption.py +409 -0
- truthound_dashboard/core/exceptions.py +627 -0
- truthound_dashboard/core/logging.py +488 -0
- truthound_dashboard/core/maintenance.py +542 -0
- truthound_dashboard/core/notifications/__init__.py +56 -0
- truthound_dashboard/core/notifications/base.py +390 -0
- truthound_dashboard/core/notifications/channels.py +557 -0
- truthound_dashboard/core/notifications/dispatcher.py +453 -0
- truthound_dashboard/core/notifications/events.py +155 -0
- truthound_dashboard/core/notifications/service.py +744 -0
- truthound_dashboard/core/sampling.py +626 -0
- truthound_dashboard/core/scheduler.py +311 -0
- truthound_dashboard/core/services.py +1531 -0
- truthound_dashboard/core/truthound_adapter.py +659 -0
- truthound_dashboard/db/__init__.py +67 -0
- truthound_dashboard/db/base.py +108 -0
- truthound_dashboard/db/database.py +196 -0
- truthound_dashboard/db/models.py +732 -0
- truthound_dashboard/db/repository.py +237 -0
- truthound_dashboard/main.py +309 -0
- truthound_dashboard/schemas/__init__.py +150 -0
- truthound_dashboard/schemas/base.py +96 -0
- truthound_dashboard/schemas/drift.py +118 -0
- truthound_dashboard/schemas/history.py +74 -0
- truthound_dashboard/schemas/profile.py +91 -0
- truthound_dashboard/schemas/rule.py +199 -0
- truthound_dashboard/schemas/schedule.py +88 -0
- truthound_dashboard/schemas/schema.py +121 -0
- truthound_dashboard/schemas/source.py +138 -0
- truthound_dashboard/schemas/validation.py +192 -0
- truthound_dashboard/static/assets/index-BqJMyAHX.js +110 -0
- truthound_dashboard/static/assets/index-DMDxHCTs.js +465 -0
- truthound_dashboard/static/assets/index-Dm2D11TK.css +1 -0
- truthound_dashboard/static/index.html +15 -0
- truthound_dashboard/static/mockServiceWorker.js +349 -0
- truthound_dashboard-1.0.0.dist-info/METADATA +218 -0
- truthound_dashboard-1.0.0.dist-info/RECORD +62 -0
- truthound_dashboard-1.0.0.dist-info/WHEEL +4 -0
- truthound_dashboard-1.0.0.dist-info/entry_points.txt +5 -0
|
@@ -0,0 +1,659 @@
|
|
|
1
|
+
"""Async wrapper for truthound package.
|
|
2
|
+
|
|
3
|
+
This module provides an async interface to truthound functions,
|
|
4
|
+
enabling non-blocking validation operations in the FastAPI application.
|
|
5
|
+
|
|
6
|
+
The adapter uses ThreadPoolExecutor to run synchronous truthound
|
|
7
|
+
functions without blocking the async event loop.
|
|
8
|
+
|
|
9
|
+
Features:
|
|
10
|
+
- Async wrappers for all truthound functions
|
|
11
|
+
- Automatic sampling for large datasets (100MB+ files)
|
|
12
|
+
- Configurable sample size and sampling methods
|
|
13
|
+
|
|
14
|
+
Example:
|
|
15
|
+
adapter = get_adapter()
|
|
16
|
+
result = await adapter.check("/path/to/data.csv")
|
|
17
|
+
schema = await adapter.learn("/path/to/data.csv")
|
|
18
|
+
|
|
19
|
+
# With auto-sampling for large files
|
|
20
|
+
result = await adapter.check_with_sampling("/path/to/large.csv")
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import asyncio
|
|
26
|
+
import logging
|
|
27
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
28
|
+
from dataclasses import dataclass
|
|
29
|
+
from functools import partial
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Any, Protocol, runtime_checkable
|
|
32
|
+
|
|
33
|
+
import yaml
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@runtime_checkable
class TruthoundResult(Protocol):
    """Structural type for truthound result objects.

    An object satisfies this protocol when it exposes an ``issues``
    attribute. Because the class is ``@runtime_checkable`` it can also be
    used with ``isinstance``; that check only verifies the attribute is
    present, not its type.
    """

    @property
    def issues(self) -> list[Any]:
        """Issues carried by the result object."""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class CheckResult:
    """Outcome of a validation check.

    Attributes:
        passed: True when validation reported no issues.
        has_critical: True when at least one critical issue was found.
        has_high: True when at least one high severity issue was found.
        total_issues: Total number of issues.
        critical_issues: Count of critical issues.
        high_issues: Count of high severity issues.
        medium_issues: Count of medium severity issues.
        low_issues: Count of low severity issues.
        source: Data source path.
        row_count: Number of rows validated.
        column_count: Number of columns.
        issues: Validation issues, one dictionary per issue.
    """

    passed: bool
    has_critical: bool
    has_high: bool
    total_issues: int
    critical_issues: int
    high_issues: int
    medium_issues: int
    low_issues: int
    source: str
    row_count: int
    column_count: int
    issues: list[dict[str, Any]]

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dictionary.

        Keys appear in field declaration order and values are the attribute
        objects themselves (no copies are made).
        """
        exported = (
            "passed",
            "has_critical",
            "has_high",
            "total_issues",
            "critical_issues",
            "high_issues",
            "medium_issues",
            "low_issues",
            "source",
            "row_count",
            "column_count",
            "issues",
        )
        return {key: getattr(self, key) for key in exported}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass
class LearnResult:
    """Outcome of schema learning.

    Attributes:
        schema: The learned schema as a dictionary.
        schema_yaml: The same schema rendered as a YAML string.
        row_count: Number of rows analyzed, or None when not available.
        column_count: Number of columns.
        columns: Column names.
    """

    schema: dict[str, Any]
    schema_yaml: str
    row_count: int | None
    column_count: int
    columns: list[str]

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dictionary.

        Keys appear in field declaration order and values are the attribute
        objects themselves (no copies are made).
        """
        exported = ("schema", "schema_yaml", "row_count", "column_count", "columns")
        return {key: getattr(self, key) for key in exported}
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@dataclass
class ProfileResult:
    """Outcome of data profiling.

    Attributes:
        source: Data source path.
        row_count: Number of rows.
        column_count: Number of columns.
        size_bytes: Data size in bytes.
        columns: Column profiles, one dictionary per column.
    """

    source: str
    row_count: int
    column_count: int
    size_bytes: int
    columns: list[dict[str, Any]]

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dictionary.

        Keys appear in field declaration order and values are the attribute
        objects themselves (no copies are made).
        """
        exported = ("source", "row_count", "column_count", "size_bytes", "columns")
        return {key: getattr(self, key) for key in exported}
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@dataclass
class CompareResult:
    """Outcome of a drift comparison between two datasets.

    Attributes:
        baseline_source: Baseline data source path.
        current_source: Current data source path.
        baseline_rows: Number of rows in the baseline.
        current_rows: Number of rows in the current data.
        has_drift: True when drift was detected.
        has_high_drift: True when high-severity drift was detected.
        total_columns: Total number of columns compared.
        drifted_columns: Names of the columns with drift.
        columns: Per-column drift results, one dictionary per column.
    """

    baseline_source: str
    current_source: str
    baseline_rows: int
    current_rows: int
    has_drift: bool
    has_high_drift: bool
    total_columns: int
    drifted_columns: list[str]
    columns: list[dict[str, Any]]

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dictionary.

        Keys appear in field declaration order and values are the attribute
        objects themselves (no copies are made).
        """
        exported = (
            "baseline_source",
            "current_source",
            "baseline_rows",
            "current_rows",
            "has_drift",
            "has_high_drift",
            "total_columns",
            "drifted_columns",
            "columns",
        )
        return {key: getattr(self, key) for key in exported}
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
class TruthoundAdapter:
    """Async wrapper for truthound functions.

    Each public coroutine builds the synchronous truthound call, runs it on
    the adapter's own ``ThreadPoolExecutor`` so the event loop is not blocked,
    and converts the returned truthound object into one of the dashboard
    result dataclasses.

    Attributes:
        max_workers: Maximum number of worker threads.
    """

    def __init__(self, max_workers: int = 4) -> None:
        """Initialize adapter.

        Args:
            max_workers: Maximum worker threads for concurrent operations.
        """
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._max_workers = max_workers

    async def _run_in_executor(self, func: partial[Any]) -> Any:
        """Run a zero-argument callable on the adapter's thread pool.

        ``asyncio.get_running_loop()`` is used instead of
        ``asyncio.get_event_loop()``: this helper is only awaited from
        coroutines, where a running loop always exists, and it is the call
        the asyncio documentation recommends in that context.

        Args:
            func: Callable taking no arguments (typically a ``partial``).

        Returns:
            Whatever ``func`` returns; exceptions raised by ``func`` propagate
            to the awaiting coroutine.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self._executor, func)

    async def check(
        self,
        data: str,
        *,
        validators: list[str] | None = None,
        schema: str | None = None,
        auto_schema: bool = False,
        parallel: bool = False,
    ) -> CheckResult:
        """Run data validation asynchronously.

        Args:
            data: Data source path (CSV, Parquet, etc.).
            validators: Optional list of validator names to run.
            schema: Optional path to schema YAML file.
            auto_schema: If True, auto-learns schema for validation.
            parallel: If True, uses parallel execution.

        Returns:
            CheckResult with validation results.

        Raises:
            ImportError: If truthound is not installed (it is imported
                lazily inside this method).
            FileNotFoundError: Presumably raised by truthound when the data
                file does not exist; nothing in this module raises it.
        """
        import truthound as th

        call = partial(
            th.check,
            data,
            validators=validators,
            schema=schema,
            auto_schema=auto_schema,
            parallel=parallel,
        )
        report = await self._run_in_executor(call)
        return self._convert_check_result(report)

    async def learn(
        self,
        source: str,
        *,
        infer_constraints: bool = True,
    ) -> LearnResult:
        """Learn schema from data asynchronously.

        Uses truthound's th.learn() to analyze data and generate schema.

        Args:
            source: Data source path.
            infer_constraints: If True, infer constraints from statistics.

        Returns:
            LearnResult with schema information.
        """
        import truthound as th

        call = partial(th.learn, source, infer_constraints=infer_constraints)
        learned = await self._run_in_executor(call)
        return self._convert_learn_result(learned)

    async def profile(self, source: str) -> ProfileResult:
        """Run data profiling asynchronously.

        Args:
            source: Data source path.

        Returns:
            ProfileResult with profiling information.
        """
        import truthound as th

        call = partial(th.profile, source)
        report = await self._run_in_executor(call)
        return self._convert_profile_result(report)

    async def compare(
        self,
        baseline: str,
        current: str,
        *,
        columns: list[str] | None = None,
        method: str = "auto",
        threshold: float | None = None,
        sample_size: int | None = None,
    ) -> CompareResult:
        """Compare two datasets for drift detection.

        Args:
            baseline: Reference data path.
            current: Current data path to compare.
            columns: Optional list of columns to compare. If None, all common columns.
            method: Detection method - "auto", "ks", "psi", "chi2", or "js".
            threshold: Optional custom threshold for drift detection.
            sample_size: Optional sample size for large datasets.

        Returns:
            CompareResult with drift detection results.
        """
        import truthound as th

        call = partial(
            th.compare,
            baseline,
            current,
            columns=columns,
            method=method,
            threshold=threshold,
            sample_size=sample_size,
        )
        report = await self._run_in_executor(call)
        return self._convert_compare_result(report)

    async def check_with_sampling(
        self,
        data: str,
        *,
        validators: list[str] | None = None,
        schema: str | None = None,
        auto_schema: bool = False,
        parallel: bool = False,
        sample_size: int | None = None,
        sampling_method: str | None = None,
    ) -> CheckResult:
        """Run data validation with automatic sampling for large datasets.

        When ``data`` is an existing path and the sampler reports that it
        needs sampling, a sample is produced first and validation runs on the
        sample instead of the full file. Whether a file counts as large is
        decided by the sampler (``sampler.needs_sampling``).

        Args:
            data: Data source path (CSV, Parquet, etc.).
            validators: Optional list of validator names to run.
            schema: Optional path to schema YAML file.
            auto_schema: If True, auto-learns schema for validation.
            parallel: If True, uses parallel execution.
            sample_size: Number of rows to sample; passed to the sampler
                as ``n`` (None leaves the choice to the sampler).
            sampling_method: Name of a ``SamplingMethod`` member value. An
                unknown name is logged as a warning and treated as if no
                method had been given.

        Returns:
            CheckResult with validation results.

        Note:
            When sampling was performed the result describes the sampled
            file. The original row count is only written to the log; it is
            not part of the returned CheckResult.
        """
        from truthound_dashboard.core.sampling import SamplingMethod, get_sampler

        sampler = get_sampler()

        path = Path(data)
        if path.exists() and sampler.needs_sampling(path):
            # Resolve the requested method name; fall back to the sampler's
            # own choice (method=None) when the name is missing or unknown.
            method = None
            if sampling_method:
                try:
                    method = SamplingMethod(sampling_method)
                except ValueError:
                    logger.warning(f"Unknown sampling method: {sampling_method}")

            sample_result = await sampler.auto_sample(
                path,
                n=sample_size,
                method=method,
            )

            if sample_result.was_sampled:
                logger.info(
                    f"Sampled {sample_result.sampled_rows} rows from "
                    f"{sample_result.original_rows} ({sample_result.size_reduction_pct:.1f}% reduction)"
                )
                # NOTE(review): the type of sampled_path is defined in
                # core.sampling (not visible here); str() keeps check()'s
                # declared ``str`` contract whether it is a str or a Path.
                data = str(sample_result.sampled_path)

        # Run validation on (possibly sampled) data
        return await self.check(
            data,
            validators=validators,
            schema=schema,
            auto_schema=auto_schema,
            parallel=parallel,
        )

    async def learn_with_sampling(
        self,
        source: str,
        *,
        infer_constraints: bool = True,
        sample_size: int | None = None,
    ) -> LearnResult:
        """Learn schema from data with automatic sampling for large datasets.

        Args:
            source: Data source path.
            infer_constraints: If True, infer constraints from statistics.
            sample_size: Number of rows to sample; passed to the sampler
                as ``n`` (None leaves the choice to the sampler).

        Returns:
            LearnResult with schema information.
        """
        from truthound_dashboard.core.sampling import get_sampler

        sampler = get_sampler()

        path = Path(source)
        if path.exists() and sampler.needs_sampling(path):
            sample_result = await sampler.auto_sample(path, n=sample_size)
            if sample_result.was_sampled:
                logger.info(
                    f"Sampled {sample_result.sampled_rows} rows for schema learning"
                )
                # NOTE(review): see check_with_sampling; str() keeps the
                # declared ``str`` contract of learn().
                source = str(sample_result.sampled_path)

        return await self.learn(source, infer_constraints=infer_constraints)

    async def profile_with_sampling(
        self,
        source: str,
        *,
        sample_size: int | None = None,
    ) -> ProfileResult:
        """Run data profiling with automatic sampling for large datasets.

        Args:
            source: Data source path.
            sample_size: Number of rows to sample; passed to the sampler
                as ``n`` (None leaves the choice to the sampler).

        Returns:
            ProfileResult with profiling information.
        """
        from truthound_dashboard.core.sampling import get_sampler

        sampler = get_sampler()

        path = Path(source)
        if path.exists() and sampler.needs_sampling(path):
            sample_result = await sampler.auto_sample(path, n=sample_size)
            if sample_result.was_sampled:
                logger.info(
                    f"Sampled {sample_result.sampled_rows} rows for profiling"
                )
                # NOTE(review): see check_with_sampling; str() keeps the
                # declared ``str`` contract of profile().
                source = str(sample_result.sampled_path)

        return await self.profile(source)

    def _convert_check_result(self, result: Any) -> CheckResult:
        """Convert truthound Report to CheckResult.

        Reads these attributes from the truthound Report:
        - issues: list[ValidationIssue]
        - source: str
        - row_count: int
        - column_count: int
        - has_issues: bool
        - has_critical: bool
        - has_high: bool

        Args:
            result: Report object returned by ``th.check``.

        Returns:
            CheckResult with per-severity counts and issues as dictionaries.
        """
        issues = result.issues
        severity_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0}

        # Severities are matched case-insensitively; values outside the four
        # known levels are not counted per level but still contribute to
        # total_issues below.
        for issue in issues:
            severity = issue.severity.value.lower()
            if severity in severity_counts:
                severity_counts[severity] += 1

        converted_issues = [
            {
                "column": issue.column,
                "issue_type": issue.issue_type,
                "count": issue.count,
                "severity": issue.severity.value,
                # Optional attributes: absent ones are reported as None.
                "details": getattr(issue, "details", None),
                "expected": getattr(issue, "expected", None),
                "actual": getattr(issue, "actual", None),
            }
            for issue in issues
        ]

        return CheckResult(
            passed=not result.has_issues,
            has_critical=result.has_critical,
            has_high=result.has_high,
            total_issues=len(issues),
            critical_issues=severity_counts["critical"],
            high_issues=severity_counts["high"],
            medium_issues=severity_counts["medium"],
            low_issues=severity_counts["low"],
            source=result.source,
            row_count=result.row_count,
            column_count=result.column_count,
            issues=converted_issues,
        )

    def _convert_learn_result(self, result: Any) -> LearnResult:
        """Convert truthound Schema to LearnResult.

        Reads these members from the truthound Schema:
        - columns: dict[str, ColumnSchema]
        - row_count: int | None
        - to_dict(): Convert to dictionary

        Args:
            result: Schema object returned by ``th.learn``.

        Returns:
            LearnResult holding the schema both as a dictionary and as YAML.
        """
        schema_dict = result.to_dict()
        # Block style, original key order and raw unicode keep the YAML
        # close to the dictionary it was produced from.
        schema_yaml = yaml.dump(
            schema_dict,
            default_flow_style=False,
            sort_keys=False,
            allow_unicode=True,
        )

        return LearnResult(
            schema=schema_dict,
            schema_yaml=schema_yaml,
            row_count=result.row_count,
            column_count=len(result.columns),
            columns=list(result.columns.keys()),
        )

    def _convert_profile_result(self, result: Any) -> ProfileResult:
        """Convert truthound ProfileReport to ProfileResult.

        Reads these attributes from the truthound ProfileReport:
        - source: str
        - row_count: int
        - column_count: int
        - size_bytes: int
        - columns: list[dict]

        Args:
            result: ProfileReport object returned by ``th.profile``.

        Returns:
            ProfileResult whose column dictionaries are reduced to a fixed
            set of keys.
        """
        columns = [
            {
                # "name" and "dtype" are required; a missing key raises KeyError.
                "name": col["name"],
                "dtype": col["dtype"],
                "null_pct": col.get("null_pct", "0%"),
                "unique_pct": col.get("unique_pct", "0%"),
                # Statistics default to None when the column has no such entry.
                "min": col.get("min"),
                "max": col.get("max"),
                "mean": col.get("mean"),
                "std": col.get("std"),
            }
            for col in result.columns
        ]

        return ProfileResult(
            source=result.source,
            row_count=result.row_count,
            column_count=result.column_count,
            size_bytes=result.size_bytes,
            columns=columns,
        )

    def _convert_compare_result(self, result: Any) -> CompareResult:
        """Convert truthound DriftReport to CompareResult.

        Reads these members from the truthound DriftReport:
        - baseline_source: str
        - current_source: str
        - baseline_rows: int
        - current_rows: int
        - columns: list[ColumnDrift]
        - has_drift: bool
        - has_high_drift: bool
        - get_drifted_columns(): list[str]

        Each ColumnDrift provides:
        - column: str
        - dtype: str
        - result: DriftResult (drifted, level, method, statistic, p_value)
        - baseline_stats: dict
        - current_stats: dict

        Args:
            result: DriftReport object returned by ``th.compare``.

        Returns:
            CompareResult with one dictionary per compared column.
        """
        columns = [
            {
                "column": col.column,
                "dtype": col.dtype,
                "drifted": col.result.drifted,
                # level may be an enum-like object (use its .value) or a
                # plain value (stringified as-is).
                "level": (
                    col.result.level.value
                    if hasattr(col.result.level, "value")
                    else str(col.result.level)
                ),
                "method": col.result.method,
                "statistic": col.result.statistic,
                "p_value": col.result.p_value,
                "baseline_stats": col.baseline_stats,
                "current_stats": col.current_stats,
            }
            for col in result.columns
        ]

        return CompareResult(
            baseline_source=result.baseline_source,
            current_source=result.current_source,
            baseline_rows=result.baseline_rows,
            current_rows=result.current_rows,
            has_drift=result.has_drift,
            has_high_drift=result.has_high_drift,
            total_columns=len(result.columns),
            drifted_columns=result.get_drifted_columns(),
            columns=columns,
        )

    def shutdown(self) -> None:
        """Shutdown the executor without waiting for running tasks to finish."""
        self._executor.shutdown(wait=False)
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
# Singleton instance
|
|
636
|
+
_adapter: TruthoundAdapter | None = None
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def get_adapter() -> TruthoundAdapter:
    """Return the shared adapter, creating it on first use.

    The first call reads ``max_workers`` from the dashboard settings and
    stores the new adapter in the module-level ``_adapter``; later calls
    return that same object.

    Returns:
        TruthoundAdapter singleton.
    """
    global _adapter
    if _adapter is not None:
        return _adapter

    # Imported lazily, only when the adapter actually has to be built.
    from truthound_dashboard.config import get_settings

    worker_limit = get_settings().max_workers
    _adapter = TruthoundAdapter(max_workers=worker_limit)
    return _adapter
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def reset_adapter() -> None:
    """Drop the adapter singleton, shutting it down first (for testing).

    Does nothing when no adapter has been created yet.
    """
    global _adapter
    if _adapter is None:
        return
    _adapter.shutdown()
    _adapter = None
|