duckguard 3.0.1__py3-none-any.whl → 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/ai/__init__.py +33 -0
- duckguard/ai/config.py +201 -0
- duckguard/ai/explainer.py +109 -0
- duckguard/ai/fixer.py +105 -0
- duckguard/ai/natural_language.py +119 -0
- duckguard/ai/rules_generator.py +121 -0
- duckguard/checks/conditional.py +4 -3
- duckguard/cli/main.py +480 -93
- duckguard/core/column.py +15 -5
- duckguard/core/result.py +35 -14
- duckguard/profiler/auto_profile.py +217 -64
- duckguard/py.typed +0 -0
- duckguard/reports/html_reporter.py +522 -37
- duckguard/reports/pdf_reporter.py +33 -5
- duckguard/semantic/detector.py +18 -7
- duckguard-3.2.0.dist-info/METADATA +1206 -0
- {duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/RECORD +22 -14
- duckguard-3.2.0.dist-info/licenses/LICENSE +190 -0
- duckguard-3.2.0.dist-info/licenses/NOTICE +7 -0
- duckguard-3.0.1.dist-info/METADATA +0 -1072
- duckguard-3.0.1.dist-info/licenses/LICENSE +0 -55
- {duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/WHEEL +0 -0
- {duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/entry_points.txt +0 -0
duckguard/cli/main.py
CHANGED
|
@@ -6,6 +6,7 @@ A modern, beautiful CLI for data quality that just works.
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
8
|
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
9
10
|
|
|
10
11
|
import typer
|
|
11
12
|
from rich.console import Console
|
|
@@ -28,11 +29,13 @@ console = Console()
|
|
|
28
29
|
def version_callback(value: bool) -> None:
|
|
29
30
|
"""Print version and exit."""
|
|
30
31
|
if value:
|
|
31
|
-
console.print(
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
32
|
+
console.print(
|
|
33
|
+
Panel(
|
|
34
|
+
f"[bold blue]DuckGuard[/bold blue] v{__version__}\n"
|
|
35
|
+
"[dim]The fast, simple data quality tool[/dim]",
|
|
36
|
+
border_style="blue",
|
|
37
|
+
)
|
|
38
|
+
)
|
|
36
39
|
raise typer.Exit()
|
|
37
40
|
|
|
38
41
|
|
|
@@ -54,10 +57,16 @@ def main(
|
|
|
54
57
|
@app.command()
|
|
55
58
|
def check(
|
|
56
59
|
source: str = typer.Argument(..., help="Path to file or connection string"),
|
|
57
|
-
config: str | None = typer.Option(
|
|
60
|
+
config: str | None = typer.Option(
|
|
61
|
+
None, "--config", "-c", help="Path to duckguard.yaml rules file"
|
|
62
|
+
),
|
|
58
63
|
table: str | None = typer.Option(None, "--table", "-t", help="Table name (for databases)"),
|
|
59
|
-
not_null: list[str] | None = typer.Option(
|
|
60
|
-
|
|
64
|
+
not_null: list[str] | None = typer.Option(
|
|
65
|
+
None, "--not-null", "-n", help="Columns that must not be null"
|
|
66
|
+
),
|
|
67
|
+
unique: list[str] | None = typer.Option(
|
|
68
|
+
None, "--unique", "-u", help="Columns that must be unique"
|
|
69
|
+
),
|
|
61
70
|
output: str | None = typer.Option(None, "--output", "-o", help="Output file (json)"),
|
|
62
71
|
verbose: bool = typer.Option(False, "--verbose", "-V", help="Verbose output"),
|
|
63
72
|
) -> None:
|
|
@@ -115,7 +124,9 @@ def check(
|
|
|
115
124
|
results = []
|
|
116
125
|
|
|
117
126
|
# Row count check
|
|
118
|
-
results.append(
|
|
127
|
+
results.append(
|
|
128
|
+
("Row count > 0", dataset.row_count > 0, f"{dataset.row_count:,} rows", None)
|
|
129
|
+
)
|
|
119
130
|
|
|
120
131
|
# Not null checks
|
|
121
132
|
if not_null:
|
|
@@ -123,14 +134,18 @@ def check(
|
|
|
123
134
|
if col_name in dataset.columns:
|
|
124
135
|
col = dataset[col_name]
|
|
125
136
|
passed = col.null_count == 0
|
|
126
|
-
results.append(
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
137
|
+
results.append(
|
|
138
|
+
(
|
|
139
|
+
f"{col_name} not null",
|
|
140
|
+
passed,
|
|
141
|
+
f"{col.null_count:,} nulls ({col.null_percent:.1f}%)",
|
|
142
|
+
col_name,
|
|
143
|
+
)
|
|
144
|
+
)
|
|
132
145
|
else:
|
|
133
|
-
results.append(
|
|
146
|
+
results.append(
|
|
147
|
+
(f"{col_name} not null", False, "Column not found", col_name)
|
|
148
|
+
)
|
|
134
149
|
|
|
135
150
|
# Unique checks
|
|
136
151
|
if unique:
|
|
@@ -139,12 +154,14 @@ def check(
|
|
|
139
154
|
col = dataset[col_name]
|
|
140
155
|
passed = col.unique_percent == 100
|
|
141
156
|
dup_count = col.total_count - col.unique_count
|
|
142
|
-
results.append(
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
157
|
+
results.append(
|
|
158
|
+
(
|
|
159
|
+
f"{col_name} unique",
|
|
160
|
+
passed,
|
|
161
|
+
f"{col.unique_percent:.1f}% unique ({dup_count:,} duplicates)",
|
|
162
|
+
col_name,
|
|
163
|
+
)
|
|
164
|
+
)
|
|
148
165
|
else:
|
|
149
166
|
results.append((f"{col_name} unique", False, "Column not found", col_name))
|
|
150
167
|
|
|
@@ -179,7 +196,9 @@ def check(
|
|
|
179
196
|
def discover(
|
|
180
197
|
source: str = typer.Argument(..., help="Path to file or connection string"),
|
|
181
198
|
table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
|
|
182
|
-
output: str | None = typer.Option(
|
|
199
|
+
output: str | None = typer.Option(
|
|
200
|
+
None, "--output", "-o", help="Output file for rules (duckguard.yaml)"
|
|
201
|
+
),
|
|
183
202
|
format: str = typer.Option("yaml", "--format", "-f", help="Output format: yaml, python"),
|
|
184
203
|
) -> None:
|
|
185
204
|
"""
|
|
@@ -228,17 +247,181 @@ def discover(
|
|
|
228
247
|
else:
|
|
229
248
|
# Display YAML
|
|
230
249
|
yaml_content = ruleset_to_yaml(ruleset)
|
|
231
|
-
console.print(
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
250
|
+
console.print(
|
|
251
|
+
Panel(
|
|
252
|
+
Syntax(yaml_content, "yaml", theme="monokai"),
|
|
253
|
+
title="Generated Rules (duckguard.yaml)",
|
|
254
|
+
border_style="green",
|
|
255
|
+
)
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
except Exception as e:
|
|
259
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
260
|
+
raise typer.Exit(1)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
@app.command(name="profile")
|
|
264
|
+
def profile_command(
|
|
265
|
+
source: str = typer.Argument(..., help="Path to file or connection string"),
|
|
266
|
+
table: str | None = typer.Option(None, "--table", "-t", help="Table name (for databases)"),
|
|
267
|
+
deep: bool = typer.Option(
|
|
268
|
+
False, "--deep", "-d", help="Enable deep profiling (distribution, outliers)"
|
|
269
|
+
),
|
|
270
|
+
output: str | None = typer.Option(None, "--output", "-o", help="Output file (json)"),
|
|
271
|
+
output_format: str = typer.Option("table", "--format", "-f", help="Output format: table, json"),
|
|
272
|
+
) -> None:
|
|
273
|
+
"""
|
|
274
|
+
Profile a data source and suggest validation rules.
|
|
275
|
+
|
|
276
|
+
Analyzes data patterns, statistics, and quality to generate
|
|
277
|
+
a comprehensive profile with rule suggestions.
|
|
278
|
+
|
|
279
|
+
[bold]Examples:[/bold]
|
|
280
|
+
duckguard profile data.csv
|
|
281
|
+
duckguard profile data.csv --deep
|
|
282
|
+
duckguard profile data.csv --format json
|
|
283
|
+
duckguard profile postgres://localhost/db --table orders
|
|
284
|
+
"""
|
|
285
|
+
import json as json_module
|
|
286
|
+
|
|
287
|
+
from duckguard.connectors import connect
|
|
288
|
+
from duckguard.profiler import AutoProfiler
|
|
289
|
+
|
|
290
|
+
if output_format != "json":
|
|
291
|
+
console.print(f"\n[bold blue]DuckGuard[/bold blue] Profiling: [cyan]{source}[/cyan]\n")
|
|
292
|
+
|
|
293
|
+
try:
|
|
294
|
+
with Progress(
|
|
295
|
+
SpinnerColumn(),
|
|
296
|
+
TextColumn("[progress.description]{task.description}"),
|
|
297
|
+
console=console,
|
|
298
|
+
transient=True,
|
|
299
|
+
) as progress:
|
|
300
|
+
_task = progress.add_task("Profiling data...", total=None) # noqa: F841
|
|
301
|
+
dataset = connect(source, table=table)
|
|
302
|
+
profiler = AutoProfiler(deep=deep)
|
|
303
|
+
result = profiler.profile(dataset)
|
|
304
|
+
|
|
305
|
+
if output_format == "json":
|
|
306
|
+
data = _profile_to_dict(result)
|
|
307
|
+
json_str = json_module.dumps(data, indent=2, default=str)
|
|
308
|
+
if output:
|
|
309
|
+
Path(output).write_text(json_str, encoding="utf-8")
|
|
310
|
+
console.print(f"[green]SAVED[/green] Profile saved to [cyan]{output}[/cyan]")
|
|
311
|
+
else:
|
|
312
|
+
print(json_str)
|
|
313
|
+
else:
|
|
314
|
+
_display_profile_result(result)
|
|
315
|
+
|
|
316
|
+
if output:
|
|
317
|
+
data = _profile_to_dict(result)
|
|
318
|
+
Path(output).write_text(
|
|
319
|
+
json_module.dumps(data, indent=2, default=str), encoding="utf-8"
|
|
320
|
+
)
|
|
321
|
+
console.print(f"\n[green]SAVED[/green] Profile saved to [cyan]{output}[/cyan]")
|
|
236
322
|
|
|
237
323
|
except Exception as e:
|
|
238
324
|
console.print(f"[red]Error:[/red] {e}")
|
|
239
325
|
raise typer.Exit(1)
|
|
240
326
|
|
|
241
327
|
|
|
328
|
+
def _display_profile_result(result: Any) -> None:
|
|
329
|
+
"""Display profiling results in a rich table."""
|
|
330
|
+
_grade_colors = {"A": "green", "B": "blue", "C": "yellow", "D": "orange1", "F": "red"}
|
|
331
|
+
|
|
332
|
+
summary_parts = [
|
|
333
|
+
f"Rows: [cyan]{result.row_count:,}[/cyan]",
|
|
334
|
+
f"Columns: [cyan]{result.column_count}[/cyan]",
|
|
335
|
+
f"Rules Suggested: [cyan]{len(result.suggested_rules)}[/cyan]",
|
|
336
|
+
]
|
|
337
|
+
if result.overall_quality_score is not None:
|
|
338
|
+
color = _grade_colors.get(result.overall_quality_grade, "white")
|
|
339
|
+
summary_parts.append(
|
|
340
|
+
f"Quality: [{color}]{result.overall_quality_score:.0f}/100 "
|
|
341
|
+
f"({result.overall_quality_grade})[/{color}]"
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
console.print(Panel("\n".join(summary_parts), title="Profile Summary", border_style="blue"))
|
|
345
|
+
console.print()
|
|
346
|
+
|
|
347
|
+
col_table = Table(title="Column Profiles")
|
|
348
|
+
col_table.add_column("Column", style="cyan")
|
|
349
|
+
col_table.add_column("Type", style="magenta")
|
|
350
|
+
col_table.add_column("Nulls", justify="right")
|
|
351
|
+
col_table.add_column("Unique", justify="right")
|
|
352
|
+
col_table.add_column("Min", justify="right")
|
|
353
|
+
col_table.add_column("Max", justify="right")
|
|
354
|
+
col_table.add_column("Grade", justify="center")
|
|
355
|
+
col_table.add_column("Rules", justify="right")
|
|
356
|
+
|
|
357
|
+
for col in result.columns:
|
|
358
|
+
grade_str = ""
|
|
359
|
+
if col.quality_grade:
|
|
360
|
+
color = _grade_colors.get(col.quality_grade, "white")
|
|
361
|
+
grade_str = f"[{color}]{col.quality_grade}[/{color}]"
|
|
362
|
+
|
|
363
|
+
col_table.add_row(
|
|
364
|
+
col.name,
|
|
365
|
+
col.dtype,
|
|
366
|
+
f"{col.null_percent:.1f}%",
|
|
367
|
+
f"{col.unique_percent:.1f}%",
|
|
368
|
+
str(col.min_value) if col.min_value is not None else "-",
|
|
369
|
+
str(col.max_value) if col.max_value is not None else "-",
|
|
370
|
+
grade_str or "-",
|
|
371
|
+
str(len(col.suggested_rules)),
|
|
372
|
+
)
|
|
373
|
+
|
|
374
|
+
console.print(col_table)
|
|
375
|
+
|
|
376
|
+
if result.suggested_rules:
|
|
377
|
+
console.print()
|
|
378
|
+
console.print(f"[bold]Suggested Rules ({len(result.suggested_rules)}):[/bold]")
|
|
379
|
+
for rule in result.suggested_rules[:20]:
|
|
380
|
+
console.print(f" {rule}")
|
|
381
|
+
if len(result.suggested_rules) > 20:
|
|
382
|
+
console.print(f" [dim]... and {len(result.suggested_rules) - 20} more[/dim]")
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def _profile_to_dict(result: Any) -> dict[str, Any]:
|
|
386
|
+
"""Convert ProfileResult to a JSON-serializable dict."""
|
|
387
|
+
|
|
388
|
+
return {
|
|
389
|
+
"source": result.source,
|
|
390
|
+
"row_count": result.row_count,
|
|
391
|
+
"column_count": result.column_count,
|
|
392
|
+
"overall_quality_score": result.overall_quality_score,
|
|
393
|
+
"overall_quality_grade": result.overall_quality_grade,
|
|
394
|
+
"columns": [
|
|
395
|
+
{
|
|
396
|
+
"name": col.name,
|
|
397
|
+
"dtype": col.dtype,
|
|
398
|
+
"null_count": col.null_count,
|
|
399
|
+
"null_percent": col.null_percent,
|
|
400
|
+
"unique_count": col.unique_count,
|
|
401
|
+
"unique_percent": col.unique_percent,
|
|
402
|
+
"min_value": col.min_value,
|
|
403
|
+
"max_value": col.max_value,
|
|
404
|
+
"mean_value": col.mean_value,
|
|
405
|
+
"stddev_value": col.stddev_value,
|
|
406
|
+
"median_value": col.median_value,
|
|
407
|
+
"p25_value": col.p25_value,
|
|
408
|
+
"p75_value": col.p75_value,
|
|
409
|
+
"quality_score": col.quality_score,
|
|
410
|
+
"quality_grade": col.quality_grade,
|
|
411
|
+
"distribution_type": col.distribution_type,
|
|
412
|
+
"skewness": col.skewness,
|
|
413
|
+
"kurtosis": col.kurtosis,
|
|
414
|
+
"is_normal": col.is_normal,
|
|
415
|
+
"outlier_count": col.outlier_count,
|
|
416
|
+
"outlier_percentage": col.outlier_percentage,
|
|
417
|
+
"suggested_rules": col.suggested_rules,
|
|
418
|
+
}
|
|
419
|
+
for col in result.columns
|
|
420
|
+
],
|
|
421
|
+
"suggested_rules": result.suggested_rules,
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
|
|
242
425
|
@app.command()
|
|
243
426
|
def contract(
|
|
244
427
|
action: str = typer.Argument(..., help="Action: generate, validate, diff"),
|
|
@@ -274,7 +457,9 @@ def contract(
|
|
|
274
457
|
console.print("[red]Error:[/red] Source required for generate")
|
|
275
458
|
raise typer.Exit(1)
|
|
276
459
|
|
|
277
|
-
console.print(
|
|
460
|
+
console.print(
|
|
461
|
+
f"\n[bold blue]DuckGuard[/bold blue] Generating contract for: [cyan]{source}[/cyan]\n"
|
|
462
|
+
)
|
|
278
463
|
|
|
279
464
|
with Progress(
|
|
280
465
|
SpinnerColumn(),
|
|
@@ -338,10 +523,16 @@ def contract(
|
|
|
338
523
|
def anomaly(
|
|
339
524
|
source: str = typer.Argument(..., help="Path to file or connection string"),
|
|
340
525
|
table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
|
|
341
|
-
method: str = typer.Option(
|
|
526
|
+
method: str = typer.Option(
|
|
527
|
+
"zscore", "--method", "-m", help="Method: zscore, iqr, percent_change, baseline, ks_test"
|
|
528
|
+
),
|
|
342
529
|
threshold: float | None = typer.Option(None, "--threshold", help="Detection threshold"),
|
|
343
|
-
columns: list[str] | None = typer.Option(
|
|
344
|
-
|
|
530
|
+
columns: list[str] | None = typer.Option(
|
|
531
|
+
None, "--column", "-c", help="Specific columns to check"
|
|
532
|
+
),
|
|
533
|
+
learn_baseline: bool = typer.Option(
|
|
534
|
+
False, "--learn-baseline", "-L", help="Learn and store baseline from current data"
|
|
535
|
+
),
|
|
345
536
|
) -> None:
|
|
346
537
|
"""
|
|
347
538
|
Detect anomalies in data.
|
|
@@ -364,7 +555,9 @@ def anomaly(
|
|
|
364
555
|
from duckguard.anomaly import detect_anomalies
|
|
365
556
|
from duckguard.connectors import connect
|
|
366
557
|
|
|
367
|
-
console.print(
|
|
558
|
+
console.print(
|
|
559
|
+
f"\n[bold blue]DuckGuard[/bold blue] Detecting anomalies in: [cyan]{source}[/cyan]\n"
|
|
560
|
+
)
|
|
368
561
|
|
|
369
562
|
try:
|
|
370
563
|
with Progress(
|
|
@@ -401,7 +594,9 @@ def anomaly(
|
|
|
401
594
|
learned += 1
|
|
402
595
|
|
|
403
596
|
console.print(f"[green]LEARNED[/green] Baselines stored for {learned} columns")
|
|
404
|
-
console.print(
|
|
597
|
+
console.print(
|
|
598
|
+
"[dim]Use --method baseline to compare against stored baselines[/dim]"
|
|
599
|
+
)
|
|
405
600
|
return
|
|
406
601
|
|
|
407
602
|
# Regular anomaly detection
|
|
@@ -441,10 +636,7 @@ def info(
|
|
|
441
636
|
dataset = connect(source, table=table)
|
|
442
637
|
analyzer = SemanticAnalyzer()
|
|
443
638
|
|
|
444
|
-
console.print(Panel(
|
|
445
|
-
f"[bold]{dataset.name}[/bold]",
|
|
446
|
-
border_style="blue"
|
|
447
|
-
))
|
|
639
|
+
console.print(Panel(f"[bold]{dataset.name}[/bold]", border_style="blue"))
|
|
448
640
|
|
|
449
641
|
# Basic info
|
|
450
642
|
info_table = Table(show_header=False, box=None)
|
|
@@ -496,6 +688,7 @@ def info(
|
|
|
496
688
|
|
|
497
689
|
# Helper display functions
|
|
498
690
|
|
|
691
|
+
|
|
499
692
|
def _display_execution_result(result, verbose: bool = False) -> None:
|
|
500
693
|
"""Display rule execution results."""
|
|
501
694
|
table = Table(title="Validation Results")
|
|
@@ -552,11 +745,13 @@ def _display_quality_score(quality) -> None:
|
|
|
552
745
|
color = grade_colors.get(quality.grade, "white")
|
|
553
746
|
|
|
554
747
|
console.print()
|
|
555
|
-
console.print(
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
748
|
+
console.print(
|
|
749
|
+
Panel(
|
|
750
|
+
f"[bold]Quality Score: [{color}]{quality.overall:.0f}/100[/{color}] "
|
|
751
|
+
f"(Grade: [{color}]{quality.grade}[/{color}])[/bold]",
|
|
752
|
+
border_style=color,
|
|
753
|
+
)
|
|
754
|
+
)
|
|
560
755
|
|
|
561
756
|
|
|
562
757
|
def _display_discovery_results(analysis, ruleset) -> None:
|
|
@@ -566,11 +761,13 @@ def _display_discovery_results(analysis, ruleset) -> None:
|
|
|
566
761
|
|
|
567
762
|
# PII warning
|
|
568
763
|
if analysis.pii_columns:
|
|
569
|
-
console.print(
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
764
|
+
console.print(
|
|
765
|
+
Panel(
|
|
766
|
+
"[yellow]WARNING: PII Detected[/yellow]\n"
|
|
767
|
+
+ "\n".join(f" - {col}" for col in analysis.pii_columns),
|
|
768
|
+
border_style="yellow",
|
|
769
|
+
)
|
|
770
|
+
)
|
|
574
771
|
console.print()
|
|
575
772
|
|
|
576
773
|
# Column analysis table
|
|
@@ -611,7 +808,7 @@ def _display_contract(contract) -> None:
|
|
|
611
808
|
table.add_column("PII")
|
|
612
809
|
|
|
613
810
|
for field_obj in contract.schema[:15]:
|
|
614
|
-
type_str = field_obj.type.value if hasattr(field_obj.type,
|
|
811
|
+
type_str = field_obj.type.value if hasattr(field_obj.type, "value") else str(field_obj.type)
|
|
615
812
|
table.add_row(
|
|
616
813
|
field_obj.name,
|
|
617
814
|
type_str,
|
|
@@ -645,7 +842,9 @@ def _display_contract_validation(result) -> None:
|
|
|
645
842
|
table.add_column("Severity")
|
|
646
843
|
|
|
647
844
|
for v in result.violations[:20]:
|
|
648
|
-
sev_style = {"error": "red", "warning": "yellow", "info": "dim"}.get(
|
|
845
|
+
sev_style = {"error": "red", "warning": "yellow", "info": "dim"}.get(
|
|
846
|
+
v.severity.value, "white"
|
|
847
|
+
)
|
|
649
848
|
table.add_row(
|
|
650
849
|
v.type.value,
|
|
651
850
|
v.field or "-",
|
|
@@ -696,7 +895,9 @@ def _display_anomaly_report(report) -> None:
|
|
|
696
895
|
console.print("[green]No anomalies detected[/green]")
|
|
697
896
|
return
|
|
698
897
|
|
|
699
|
-
console.print(
|
|
898
|
+
console.print(
|
|
899
|
+
f"[yellow bold]WARNING: {report.anomaly_count} anomalies detected[/yellow bold]\n"
|
|
900
|
+
)
|
|
700
901
|
|
|
701
902
|
table = Table(title="Anomalies")
|
|
702
903
|
table.add_column("Column", style="cyan")
|
|
@@ -727,10 +928,7 @@ def _save_results(output: str, dataset, results) -> None:
|
|
|
727
928
|
}
|
|
728
929
|
|
|
729
930
|
if results:
|
|
730
|
-
data["checks"] = [
|
|
731
|
-
{"name": r[0], "passed": r[1], "details": r[2]}
|
|
732
|
-
for r in results
|
|
733
|
-
]
|
|
931
|
+
data["checks"] = [{"name": r[0], "passed": r[1], "details": r[2]} for r in results]
|
|
734
932
|
|
|
735
933
|
Path(output).write_text(json.dumps(data, indent=2))
|
|
736
934
|
|
|
@@ -767,7 +965,9 @@ def history(
|
|
|
767
965
|
|
|
768
966
|
if trend and source:
|
|
769
967
|
# Show trend analysis
|
|
770
|
-
console.print(
|
|
968
|
+
console.print(
|
|
969
|
+
f"\n[bold blue]DuckGuard[/bold blue] Trend Analysis: [cyan]{source}[/cyan]\n"
|
|
970
|
+
)
|
|
771
971
|
|
|
772
972
|
analyzer = TrendAnalyzer(storage)
|
|
773
973
|
analysis = analyzer.analyze(source, days=days)
|
|
@@ -790,20 +990,24 @@ def history(
|
|
|
790
990
|
"stable": "[=]",
|
|
791
991
|
}.get(analysis.score_trend, "[=]")
|
|
792
992
|
|
|
793
|
-
console.print(
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
993
|
+
console.print(
|
|
994
|
+
Panel(
|
|
995
|
+
f"[bold]Quality Trend: [{trend_color}]{trend_symbol} {analysis.score_trend.upper()}[/{trend_color}][/bold]\n\n"
|
|
996
|
+
f"Current Score: [cyan]{analysis.current_score:.1f}%[/cyan]\n"
|
|
997
|
+
f"Average Score: [cyan]{analysis.average_score:.1f}%[/cyan]\n"
|
|
998
|
+
f"Min/Max: [dim]{analysis.min_score:.1f}% - {analysis.max_score:.1f}%[/dim]\n"
|
|
999
|
+
f"Change: [{trend_color}]{analysis.trend_change:+.1f}%[/{trend_color}]\n"
|
|
1000
|
+
f"Total Runs: [cyan]{analysis.total_runs}[/cyan]\n"
|
|
1001
|
+
f"Pass Rate: [cyan]{analysis.pass_rate:.1f}%[/cyan]",
|
|
1002
|
+
title=f"Last {days} Days",
|
|
1003
|
+
border_style=trend_color,
|
|
1004
|
+
)
|
|
1005
|
+
)
|
|
804
1006
|
|
|
805
1007
|
if analysis.anomalies:
|
|
806
|
-
console.print(
|
|
1008
|
+
console.print(
|
|
1009
|
+
f"\n[yellow]Anomalies detected on: {', '.join(analysis.anomalies)}[/yellow]"
|
|
1010
|
+
)
|
|
807
1011
|
|
|
808
1012
|
# Show daily data if available
|
|
809
1013
|
if analysis.daily_data and len(analysis.daily_data) <= 14:
|
|
@@ -816,7 +1020,11 @@ def history(
|
|
|
816
1020
|
|
|
817
1021
|
for day in analysis.daily_data:
|
|
818
1022
|
pass_rate = (day.passed_count / day.run_count * 100) if day.run_count > 0 else 0
|
|
819
|
-
score_style =
|
|
1023
|
+
score_style = (
|
|
1024
|
+
"green"
|
|
1025
|
+
if day.avg_score >= 80
|
|
1026
|
+
else "yellow" if day.avg_score >= 60 else "red"
|
|
1027
|
+
)
|
|
820
1028
|
table.add_row(
|
|
821
1029
|
day.date,
|
|
822
1030
|
f"[{score_style}]{day.avg_score:.1f}%[/{score_style}]",
|
|
@@ -829,7 +1037,9 @@ def history(
|
|
|
829
1037
|
else:
|
|
830
1038
|
# Show run history
|
|
831
1039
|
if source:
|
|
832
|
-
console.print(
|
|
1040
|
+
console.print(
|
|
1041
|
+
f"\n[bold blue]DuckGuard[/bold blue] History: [cyan]{source}[/cyan]\n"
|
|
1042
|
+
)
|
|
833
1043
|
runs = storage.get_runs(source, limit=20)
|
|
834
1044
|
else:
|
|
835
1045
|
console.print("\n[bold blue]DuckGuard[/bold blue] Recent Validation History\n")
|
|
@@ -867,7 +1077,11 @@ def history(
|
|
|
867
1077
|
table.add_column("Checks", justify="right")
|
|
868
1078
|
|
|
869
1079
|
for run in runs:
|
|
870
|
-
score_style =
|
|
1080
|
+
score_style = (
|
|
1081
|
+
"green"
|
|
1082
|
+
if run.quality_score >= 80
|
|
1083
|
+
else "yellow" if run.quality_score >= 60 else "red"
|
|
1084
|
+
)
|
|
871
1085
|
status = "[green]PASS[/green]" if run.passed else "[red]FAIL[/red]"
|
|
872
1086
|
|
|
873
1087
|
table.add_row(
|
|
@@ -893,27 +1107,42 @@ def history(
|
|
|
893
1107
|
@app.command()
|
|
894
1108
|
def report(
|
|
895
1109
|
source: str = typer.Argument(..., help="Data source path or connection string"),
|
|
896
|
-
config: str | None = typer.Option(
|
|
1110
|
+
config: str | None = typer.Option(
|
|
1111
|
+
None, "--config", "-c", help="Path to duckguard.yaml rules file"
|
|
1112
|
+
),
|
|
897
1113
|
table: str | None = typer.Option(None, "--table", "-t", help="Table name (for databases)"),
|
|
898
1114
|
output_format: str = typer.Option("html", "--format", "-f", help="Output format: html, pdf"),
|
|
899
1115
|
output: str = typer.Option("report.html", "--output", "-o", help="Output file path"),
|
|
900
1116
|
title: str = typer.Option("DuckGuard Data Quality Report", "--title", help="Report title"),
|
|
901
|
-
include_passed: bool = typer.Option(
|
|
1117
|
+
include_passed: bool = typer.Option(
|
|
1118
|
+
True, "--include-passed/--no-passed", help="Include passed checks"
|
|
1119
|
+
),
|
|
902
1120
|
store: bool = typer.Option(False, "--store", "-s", help="Store results in history"),
|
|
1121
|
+
trends: bool = typer.Option(
|
|
1122
|
+
False, "--trends", help="Include quality trend charts from history"
|
|
1123
|
+
),
|
|
1124
|
+
trend_days: int = typer.Option(
|
|
1125
|
+
30, "--trend-days", help="Number of days of history for trend charts"
|
|
1126
|
+
),
|
|
1127
|
+
dark_mode: str = typer.Option("auto", "--dark-mode", help="Theme mode: auto, light, dark"),
|
|
1128
|
+
logo: str | None = typer.Option(None, "--logo", help="Logo URL or data URI for report header"),
|
|
903
1129
|
) -> None:
|
|
904
1130
|
"""
|
|
905
1131
|
Generate a data quality report (HTML or PDF).
|
|
906
1132
|
|
|
907
|
-
Runs validation checks and generates a beautiful, shareable report
|
|
1133
|
+
Runs validation checks and generates a beautiful, shareable report
|
|
1134
|
+
with dark mode, interactive tables, and optional trend charts.
|
|
908
1135
|
|
|
909
1136
|
[bold]Examples:[/bold]
|
|
910
1137
|
duckguard report data.csv
|
|
911
1138
|
duckguard report data.csv --format pdf --output report.pdf
|
|
912
1139
|
duckguard report data.csv --config rules.yaml --title "Orders Quality"
|
|
913
1140
|
duckguard report data.csv --store # Also save to history
|
|
1141
|
+
duckguard report data.csv --trends # Include quality trend charts
|
|
1142
|
+
duckguard report data.csv --dark-mode dark # Force dark theme
|
|
914
1143
|
"""
|
|
915
1144
|
from duckguard.connectors import connect
|
|
916
|
-
from duckguard.reports import
|
|
1145
|
+
from duckguard.reports import HTMLReporter, PDFReporter, ReportConfig
|
|
917
1146
|
from duckguard.rules import execute_rules, generate_rules, load_rules
|
|
918
1147
|
|
|
919
1148
|
# Determine output path based on format
|
|
@@ -964,6 +1193,31 @@ def report(
|
|
|
964
1193
|
console.print(f"Quality Score: [cyan]{result.quality_score:.1f}%[/cyan]")
|
|
965
1194
|
console.print(f"Checks: {result.passed_count}/{result.total_checks} passed\n")
|
|
966
1195
|
|
|
1196
|
+
# Load trend data if requested
|
|
1197
|
+
trend_data = None
|
|
1198
|
+
history_runs = None
|
|
1199
|
+
if trends:
|
|
1200
|
+
from duckguard.history import HistoryStorage
|
|
1201
|
+
|
|
1202
|
+
try:
|
|
1203
|
+
storage_for_trends = HistoryStorage()
|
|
1204
|
+
trend_data = storage_for_trends.get_trend(source, days=trend_days)
|
|
1205
|
+
history_runs = storage_for_trends.get_runs(source, limit=20)
|
|
1206
|
+
if not trend_data:
|
|
1207
|
+
console.print("[dim]No historical data found for trend charts[/dim]")
|
|
1208
|
+
except Exception:
|
|
1209
|
+
console.print("[dim]No historical data found for trend charts[/dim]")
|
|
1210
|
+
|
|
1211
|
+
# Build report config
|
|
1212
|
+
report_config = ReportConfig(
|
|
1213
|
+
title=title,
|
|
1214
|
+
include_passed=include_passed,
|
|
1215
|
+
include_trends=trends,
|
|
1216
|
+
trend_days=trend_days,
|
|
1217
|
+
dark_mode=dark_mode,
|
|
1218
|
+
logo_url=logo,
|
|
1219
|
+
)
|
|
1220
|
+
|
|
967
1221
|
# Generate report
|
|
968
1222
|
with Progress(
|
|
969
1223
|
SpinnerColumn(),
|
|
@@ -974,9 +1228,18 @@ def report(
|
|
|
974
1228
|
progress.add_task(f"Generating {output_format.upper()} report...", total=None)
|
|
975
1229
|
|
|
976
1230
|
if output_format.lower() == "pdf":
|
|
977
|
-
|
|
1231
|
+
reporter = PDFReporter(config=report_config)
|
|
978
1232
|
else:
|
|
979
|
-
|
|
1233
|
+
reporter = HTMLReporter(config=report_config)
|
|
1234
|
+
|
|
1235
|
+
reporter.generate(
|
|
1236
|
+
result,
|
|
1237
|
+
output,
|
|
1238
|
+
history=history_runs,
|
|
1239
|
+
trend_data=trend_data,
|
|
1240
|
+
row_count=dataset.row_count,
|
|
1241
|
+
column_count=dataset.column_count,
|
|
1242
|
+
)
|
|
980
1243
|
|
|
981
1244
|
console.print(f"[green]SAVED[/green] Report saved to [cyan]{output}[/cyan]")
|
|
982
1245
|
console.print("[dim]Open in browser to view the report[/dim]")
|
|
@@ -997,7 +1260,9 @@ def report(
|
|
|
997
1260
|
def freshness(
|
|
998
1261
|
source: str = typer.Argument(..., help="Data source path"),
|
|
999
1262
|
column: str | None = typer.Option(None, "--column", "-c", help="Timestamp column to check"),
|
|
1000
|
-
max_age: str = typer.Option(
|
|
1263
|
+
max_age: str = typer.Option(
|
|
1264
|
+
"24h", "--max-age", "-m", help="Maximum acceptable age: 1h, 6h, 24h, 7d"
|
|
1265
|
+
),
|
|
1001
1266
|
output_format: str = typer.Option("table", "--format", "-f", help="Output format: table, json"),
|
|
1002
1267
|
) -> None:
|
|
1003
1268
|
"""
|
|
@@ -1038,6 +1303,7 @@ def freshness(
|
|
|
1038
1303
|
else:
|
|
1039
1304
|
# Try file mtime first, fallback to dataset
|
|
1040
1305
|
from pathlib import Path
|
|
1306
|
+
|
|
1041
1307
|
if Path(source).exists():
|
|
1042
1308
|
result = monitor.check_file_mtime(source)
|
|
1043
1309
|
else:
|
|
@@ -1051,15 +1317,17 @@ def freshness(
|
|
|
1051
1317
|
status_color = "green" if result.is_fresh else "red"
|
|
1052
1318
|
status_text = "FRESH" if result.is_fresh else "STALE"
|
|
1053
1319
|
|
|
1054
|
-
console.print(
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1320
|
+
console.print(
|
|
1321
|
+
Panel(
|
|
1322
|
+
f"[bold {status_color}]{status_text}[/bold {status_color}]\n\n"
|
|
1323
|
+
f"Last Modified: [cyan]{result.last_modified.strftime('%Y-%m-%d %H:%M:%S') if result.last_modified else 'Unknown'}[/cyan]\n"
|
|
1324
|
+
f"Age: [cyan]{result.age_human}[/cyan]\n"
|
|
1325
|
+
f"Threshold: [dim]{max_age}[/dim]\n"
|
|
1326
|
+
f"Method: [dim]{result.method.value}[/dim]",
|
|
1327
|
+
title="Freshness Check",
|
|
1328
|
+
border_style=status_color,
|
|
1329
|
+
)
|
|
1330
|
+
)
|
|
1063
1331
|
|
|
1064
1332
|
if not result.is_fresh:
|
|
1065
1333
|
raise typer.Exit(1)
|
|
@@ -1072,7 +1340,9 @@ def freshness(
|
|
|
1072
1340
|
@app.command()
|
|
1073
1341
|
def schema(
|
|
1074
1342
|
source: str = typer.Argument(..., help="Data source path"),
|
|
1075
|
-
action: str = typer.Option(
|
|
1343
|
+
action: str = typer.Option(
|
|
1344
|
+
"show", "--action", "-a", help="Action: show, capture, history, changes"
|
|
1345
|
+
),
|
|
1076
1346
|
table: str | None = typer.Option(None, "--table", "-t", help="Table name (for databases)"),
|
|
1077
1347
|
output_format: str = typer.Option("table", "--format", "-f", help="Output format: table, json"),
|
|
1078
1348
|
limit: int = typer.Option(10, "--limit", "-l", help="Number of results to show"),
|
|
@@ -1132,9 +1402,15 @@ def schema(
|
|
|
1132
1402
|
progress.add_task("Capturing schema snapshot...", total=None)
|
|
1133
1403
|
snapshot = tracker.capture(dataset)
|
|
1134
1404
|
|
|
1135
|
-
console.print(
|
|
1136
|
-
|
|
1137
|
-
|
|
1405
|
+
console.print(
|
|
1406
|
+
f"[green]CAPTURED[/green] Schema snapshot: [cyan]{snapshot.snapshot_id[:8]}...[/cyan]"
|
|
1407
|
+
)
|
|
1408
|
+
console.print(
|
|
1409
|
+
f"[dim]Columns: {snapshot.column_count} | Rows: {snapshot.row_count:,}[/dim]"
|
|
1410
|
+
)
|
|
1411
|
+
console.print(
|
|
1412
|
+
f"[dim]Captured at: {snapshot.captured_at.strftime('%Y-%m-%d %H:%M:%S')}[/dim]"
|
|
1413
|
+
)
|
|
1138
1414
|
|
|
1139
1415
|
elif action == "history":
|
|
1140
1416
|
history = tracker.get_history(source, limit=limit)
|
|
@@ -1176,11 +1452,15 @@ def schema(
|
|
|
1176
1452
|
|
|
1177
1453
|
if not report.has_changes:
|
|
1178
1454
|
console.print("[green]No schema changes detected[/green]")
|
|
1179
|
-
console.print(
|
|
1455
|
+
console.print(
|
|
1456
|
+
f"[dim]Snapshot captured: {report.current_snapshot.snapshot_id[:8]}...[/dim]"
|
|
1457
|
+
)
|
|
1180
1458
|
return
|
|
1181
1459
|
|
|
1182
1460
|
# Display changes
|
|
1183
|
-
console.print(
|
|
1461
|
+
console.print(
|
|
1462
|
+
f"[yellow bold]{len(report.changes)} schema changes detected[/yellow bold]\n"
|
|
1463
|
+
)
|
|
1184
1464
|
|
|
1185
1465
|
if report.has_breaking_changes:
|
|
1186
1466
|
console.print("[red bold]BREAKING CHANGES:[/red bold]")
|
|
@@ -1207,5 +1487,112 @@ def schema(
|
|
|
1207
1487
|
raise typer.Exit(1)
|
|
1208
1488
|
|
|
1209
1489
|
|
|
1490
|
+
# =========================================================================
|
|
1491
|
+
# AI-Powered Commands
|
|
1492
|
+
# =========================================================================
|
|
1493
|
+
|
|
1494
|
+
|
|
1495
|
+
@app.command()
def explain(
    source: str = typer.Argument(..., help="Path to file or connection string"),
    table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
    focus: str | None = typer.Option(None, "--focus", "-f", help="Column or aspect to focus on"),
    detail: str = typer.Option("medium", "--detail", "-d", help="Detail level: brief, medium, detailed"),
) -> None:
    """Explain data quality issues in plain English (AI-powered).

    Requires: pip install duckguard[llm]
    """
    # The docstring above is what Typer shows as the command's help text, so
    # maintainer notes are kept in comments.
    #
    # Flow: connect to `source` (optionally a specific `table`), hand the
    # dataset to duckguard.ai.explain together with `focus` and `detail`, and
    # print the returned explanation inside a green panel. Any failure prints
    # an error message and exits with status 1.
    from rich.markup import escape

    try:
        # Imported lazily so the rest of the CLI works without the AI extras.
        from duckguard.ai import explain as ai_explain
        from duckguard.connectors import connect as dg_connect

        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console):
            dataset = dg_connect(source, table=table)

        with console.status("[bold green]Analyzing with AI..."):
            result = ai_explain(dataset, focus=focus, detail=detail)

        console.print()
        # NOTE(review): if `result` is a plain string it is rendered with
        # console markup enabled -- confirm AI output cannot contain "[...]".
        console.print(Panel(result, title="[bold]Data Quality Explanation[/bold]", border_style="green"))

    except ImportError as exc:
        console.print("[red]Error:[/red] AI features require LLM packages.")
        # "\[" escapes the bracket; otherwise Rich parses "[llm]" as a style
        # tag and silently drops it from the install hint.
        console.print("Install with: [bold]pip install duckguard\\[llm][/bold]")
        raise typer.Exit(1) from exc
    except Exception as exc:
        # Exception text is arbitrary; escape it so brackets in the message
        # are shown literally instead of being interpreted as markup.
        console.print(f"[red]Error:[/red] {escape(str(exc))}")
        raise typer.Exit(1) from exc
|
|
1526
|
+
|
|
1527
|
+
|
|
1528
|
+
@app.command()
def suggest(
    source: str = typer.Argument(..., help="Path to file or connection string"),
    table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
    output: str | None = typer.Option(None, "--output", "-o", help="Output file (default: stdout)"),
    strict: bool = typer.Option(False, "--strict", help="Generate stricter rules"),
) -> None:
    """Generate validation rules using AI (AI-powered).

    Requires: pip install duckguard[llm]
    """
    # The docstring above is what Typer shows as the command's help text, so
    # maintainer notes are kept in comments.
    #
    # Flow: connect to `source` (optionally a specific `table`), ask
    # duckguard.ai.suggest_rules for YAML rules (`strict` is forwarded), then
    # either write the YAML to `output` or print it syntax-highlighted.
    # Any failure prints an error message and exits with status 1.
    from rich.markup import escape

    try:
        # Imported lazily so the rest of the CLI works without the AI extras.
        from duckguard.ai import suggest_rules
        from duckguard.connectors import connect as dg_connect

        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console):
            dataset = dg_connect(source, table=table)

        with console.status("[bold green]Generating rules with AI..."):
            rules_yaml = suggest_rules(dataset, strict=strict)

        if output:
            # Explicit UTF-8 so generated text with non-ASCII characters is
            # written identically regardless of the platform's locale.
            with open(output, "w", encoding="utf-8") as f:
                f.write(rules_yaml)
            # The path is user input; escape it so brackets print literally.
            console.print(f"[green]Rules written to {escape(output)}[/green]")
        else:
            console.print()
            console.print(Syntax(rules_yaml, "yaml", theme="monokai"))

    except ImportError as exc:
        console.print("[red]Error:[/red] AI features require LLM packages.")
        # "\[" escapes the bracket; otherwise Rich parses "[llm]" as a style
        # tag and silently drops it from the install hint.
        console.print("Install with: [bold]pip install duckguard\\[llm][/bold]")
        raise typer.Exit(1) from exc
    except Exception as exc:
        # Exception text is arbitrary; escape it so brackets in the message
        # are shown literally instead of being interpreted as markup.
        console.print(f"[red]Error:[/red] {escape(str(exc))}")
        raise typer.Exit(1) from exc
|
|
1564
|
+
|
|
1565
|
+
|
|
1566
|
+
@app.command()
def fix(
    source: str = typer.Argument(..., help="Path to file or connection string"),
    table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
) -> None:
    """Suggest data quality fixes using AI (AI-powered).

    Requires: pip install duckguard[llm]
    """
    # The docstring above is what Typer shows as the command's help text, so
    # maintainer notes are kept in comments.
    #
    # Flow: connect to `source` (optionally a specific `table`), pass the
    # dataset to duckguard.ai.suggest_fixes, and print the result inside a
    # yellow panel. Any failure prints an error message and exits with 1.
    from rich.markup import escape

    try:
        # Imported lazily so the rest of the CLI works without the AI extras.
        from duckguard.ai import suggest_fixes
        from duckguard.connectors import connect as dg_connect

        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console):
            dataset = dg_connect(source, table=table)

        with console.status("[bold green]Analyzing fixes with AI..."):
            result = suggest_fixes(dataset)

        console.print()
        # NOTE(review): if `result` is a plain string it is rendered with
        # console markup enabled -- confirm AI output cannot contain "[...]".
        console.print(Panel(result, title="[bold]Suggested Fixes[/bold]", border_style="yellow"))

    except ImportError as exc:
        console.print("[red]Error:[/red] AI features require LLM packages.")
        # "\[" escapes the bracket; otherwise Rich parses "[llm]" as a style
        # tag and silently drops it from the install hint.
        console.print("Install with: [bold]pip install duckguard\\[llm][/bold]")
        raise typer.Exit(1) from exc
    except Exception as exc:
        # Exception text is arbitrary; escape it so brackets in the message
        # are shown literally instead of being interpreted as markup.
        console.print(f"[red]Error:[/red] {escape(str(exc))}")
        raise typer.Exit(1) from exc
|
|
1595
|
+
|
|
1596
|
+
|
|
1210
1597
|
# Script entry point: invoke the CLI application object when this module is
# executed directly rather than imported.
if __name__ == "__main__":
    app()
|