duckguard-2.0.0-py3-none-any.whl → duckguard-2.3.0-py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (71)
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
duckguard/cli/main.py CHANGED
@@ -5,21 +5,14 @@ A modern, beautiful CLI for data quality that just works.
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
- import sys
9
8
  from pathlib import Path
10
- from typing import Optional
11
9
 
12
10
  import typer
13
11
  from rich.console import Console
14
12
  from rich.panel import Panel
15
- from rich.table import Table
16
- from rich.syntax import Syntax
17
13
  from rich.progress import Progress, SpinnerColumn, TextColumn
18
- from rich import print as rprint
19
- from rich.tree import Tree
20
- from rich.text import Text
21
- from rich.columns import Columns
22
- from rich.markdown import Markdown
14
+ from rich.syntax import Syntax
15
+ from rich.table import Table
23
16
 
24
17
  from duckguard import __version__
25
18
 
@@ -45,7 +38,7 @@ def version_callback(value: bool) -> None:
45
38
 
46
39
  @app.callback()
47
40
  def main(
48
- version: Optional[bool] = typer.Option(
41
+ version: bool | None = typer.Option(
49
42
  None,
50
43
  "--version",
51
44
  "-v",
@@ -61,11 +54,11 @@ def main(
61
54
  @app.command()
62
55
  def check(
63
56
  source: str = typer.Argument(..., help="Path to file or connection string"),
64
- config: Optional[str] = typer.Option(None, "--config", "-c", help="Path to duckguard.yaml rules file"),
65
- table: Optional[str] = typer.Option(None, "--table", "-t", help="Table name (for databases)"),
66
- not_null: Optional[list[str]] = typer.Option(None, "--not-null", "-n", help="Columns that must not be null"),
67
- unique: Optional[list[str]] = typer.Option(None, "--unique", "-u", help="Columns that must be unique"),
68
- output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file (json)"),
57
+ config: str | None = typer.Option(None, "--config", "-c", help="Path to duckguard.yaml rules file"),
58
+ table: str | None = typer.Option(None, "--table", "-t", help="Table name (for databases)"),
59
+ not_null: list[str] | None = typer.Option(None, "--not-null", "-n", help="Columns that must not be null"),
60
+ unique: list[str] | None = typer.Option(None, "--unique", "-u", help="Columns that must be unique"),
61
+ output: str | None = typer.Option(None, "--output", "-o", help="Output file (json)"),
69
62
  verbose: bool = typer.Option(False, "--verbose", "-V", help="Verbose output"),
70
63
  ) -> None:
71
64
  """
@@ -78,8 +71,8 @@ def check(
78
71
  duckguard check postgres://localhost/db --table orders
79
72
  """
80
73
  from duckguard.connectors import connect
81
- from duckguard.rules import load_rules, execute_rules
82
74
  from duckguard.core.scoring import score
75
+ from duckguard.rules import execute_rules, load_rules
83
76
 
84
77
  console.print(f"\n[bold blue]DuckGuard[/bold blue] Checking: [cyan]{source}[/cyan]\n")
85
78
 
@@ -185,8 +178,8 @@ def check(
185
178
  @app.command()
186
179
  def discover(
187
180
  source: str = typer.Argument(..., help="Path to file or connection string"),
188
- table: Optional[str] = typer.Option(None, "--table", "-t", help="Table name"),
189
- output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file for rules (duckguard.yaml)"),
181
+ table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
182
+ output: str | None = typer.Option(None, "--output", "-o", help="Output file for rules (duckguard.yaml)"),
190
183
  format: str = typer.Option("yaml", "--format", "-f", help="Output format: yaml, python"),
191
184
  ) -> None:
192
185
  """
@@ -213,7 +206,7 @@ def discover(
213
206
  console=console,
214
207
  transient=True,
215
208
  ) as progress:
216
- task = progress.add_task("Analyzing data...", total=None)
209
+ _task = progress.add_task("Analyzing data...", total=None) # noqa: F841
217
210
  dataset = connect(source, table=table)
218
211
 
219
212
  # Semantic analysis
@@ -230,7 +223,7 @@ def discover(
230
223
  if output:
231
224
  yaml_content = ruleset_to_yaml(ruleset)
232
225
  Path(output).write_text(yaml_content, encoding="utf-8")
233
- console.print(f"\n[green][/green] Rules saved to [cyan]{output}[/cyan]")
226
+ console.print(f"\n[green]SAVED[/green] Rules saved to [cyan]{output}[/cyan]")
234
227
  console.print(f"[dim]Run: duckguard check {source} --config {output}[/dim]")
235
228
  else:
236
229
  # Display YAML
@@ -250,8 +243,8 @@ def discover(
250
243
  def contract(
251
244
  action: str = typer.Argument(..., help="Action: generate, validate, diff"),
252
245
  source: str = typer.Argument(None, help="Data source or contract file"),
253
- contract_file: Optional[str] = typer.Option(None, "--contract", "-c", help="Contract file path"),
254
- output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file"),
246
+ contract_file: str | None = typer.Option(None, "--contract", "-c", help="Contract file path"),
247
+ output: str | None = typer.Option(None, "--output", "-o", help="Output file"),
255
248
  strict: bool = typer.Option(False, "--strict", help="Strict validation mode"),
256
249
  ) -> None:
257
250
  """
@@ -268,13 +261,12 @@ def contract(
268
261
  duckguard contract diff old.contract.yaml new.contract.yaml
269
262
  """
270
263
  from duckguard.contracts import (
264
+ diff_contracts,
265
+ generate_contract,
271
266
  load_contract,
272
267
  validate_contract,
273
- generate_contract,
274
- diff_contracts,
275
268
  )
276
269
  from duckguard.contracts.loader import contract_to_yaml
277
- from duckguard.connectors import connect
278
270
 
279
271
  try:
280
272
  if action == "generate":
@@ -298,14 +290,14 @@ def contract(
298
290
  if output:
299
291
  yaml_content = contract_to_yaml(contract_obj)
300
292
  Path(output).write_text(yaml_content, encoding="utf-8")
301
- console.print(f"\n[green][/green] Contract saved to [cyan]{output}[/cyan]")
293
+ console.print(f"\n[green]SAVED[/green] Contract saved to [cyan]{output}[/cyan]")
302
294
 
303
295
  elif action == "validate":
304
296
  if not source or not contract_file:
305
297
  console.print("[red]Error:[/red] Both source and --contract required for validate")
306
298
  raise typer.Exit(1)
307
299
 
308
- console.print(f"\n[bold blue]DuckGuard[/bold blue] Validating against contract\n")
300
+ console.print("\n[bold blue]DuckGuard[/bold blue] Validating against contract\n")
309
301
 
310
302
  with Progress(
311
303
  SpinnerColumn(),
@@ -345,21 +337,32 @@ def contract(
345
337
  @app.command()
346
338
  def anomaly(
347
339
  source: str = typer.Argument(..., help="Path to file or connection string"),
348
- table: Optional[str] = typer.Option(None, "--table", "-t", help="Table name"),
349
- method: str = typer.Option("zscore", "--method", "-m", help="Detection method: zscore, iqr, percent_change"),
350
- threshold: Optional[float] = typer.Option(None, "--threshold", help="Detection threshold"),
351
- columns: Optional[list[str]] = typer.Option(None, "--column", "-c", help="Specific columns to check"),
340
+ table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
341
+ method: str = typer.Option("zscore", "--method", "-m", help="Method: zscore, iqr, percent_change, baseline, ks_test"),
342
+ threshold: float | None = typer.Option(None, "--threshold", help="Detection threshold"),
343
+ columns: list[str] | None = typer.Option(None, "--column", "-c", help="Specific columns to check"),
344
+ learn_baseline: bool = typer.Option(False, "--learn-baseline", "-L", help="Learn and store baseline from current data"),
352
345
  ) -> None:
353
346
  """
354
347
  Detect anomalies in data.
355
348
 
349
+ [bold]Methods:[/bold]
350
+ zscore - Z-score based detection (default)
351
+ iqr - Interquartile range detection
352
+ percent_change - Percent change from baseline
353
+ baseline - Compare to learned baseline (ML)
354
+ ks_test - Distribution drift detection (ML)
355
+
356
356
  [bold]Examples:[/bold]
357
357
  duckguard anomaly data.csv
358
358
  duckguard anomaly data.csv --method iqr --threshold 2.0
359
359
  duckguard anomaly data.csv --column amount --column quantity
360
+ duckguard anomaly data.csv --learn-baseline # Store baseline
361
+ duckguard anomaly data.csv --method baseline # Compare to baseline
362
+ duckguard anomaly data.csv --method ks_test # Detect drift
360
363
  """
361
- from duckguard.connectors import connect
362
364
  from duckguard.anomaly import detect_anomalies
365
+ from duckguard.connectors import connect
363
366
 
364
367
  console.print(f"\n[bold blue]DuckGuard[/bold blue] Detecting anomalies in: [cyan]{source}[/cyan]\n")
365
368
 
@@ -370,8 +373,38 @@ def anomaly(
370
373
  console=console,
371
374
  transient=True,
372
375
  ) as progress:
373
- progress.add_task("Analyzing data...", total=None)
376
+ if learn_baseline:
377
+ progress.add_task("Learning baseline...", total=None)
378
+ else:
379
+ progress.add_task("Analyzing data...", total=None)
380
+
374
381
  dataset = connect(source, table=table)
382
+
383
+ # Handle baseline learning
384
+ if learn_baseline:
385
+ from duckguard.anomaly import BaselineMethod
386
+ from duckguard.history import HistoryStorage
387
+
388
+ storage = HistoryStorage()
389
+ baseline_method = BaselineMethod(storage=storage)
390
+
391
+ # Get numeric columns to learn baselines for
392
+ target_columns = columns if columns else dataset.columns
393
+ learned = 0
394
+
395
+ for col_name in target_columns:
396
+ col = dataset[col_name]
397
+ if col.mean is not None: # Numeric column
398
+ values = col.values
399
+ baseline_method.fit(values)
400
+ baseline_method.save_baseline(source, col_name)
401
+ learned += 1
402
+
403
+ console.print(f"[green]LEARNED[/green] Baselines stored for {learned} columns")
404
+ console.print("[dim]Use --method baseline to compare against stored baselines[/dim]")
405
+ return
406
+
407
+ # Regular anomaly detection
375
408
  report = detect_anomalies(
376
409
  dataset,
377
410
  method=method,
@@ -392,7 +425,7 @@ def anomaly(
392
425
  @app.command()
393
426
  def info(
394
427
  source: str = typer.Argument(..., help="Path to file or connection string"),
395
- table: Optional[str] = typer.Option(None, "--table", "-t", help="Table name"),
428
+ table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
396
429
  ) -> None:
397
430
  """
398
431
  Display information about a data source.
@@ -441,7 +474,7 @@ def info(
441
474
  if sem_type == "unknown":
442
475
  sem_type = "-"
443
476
  if col_analysis.is_pii:
444
- sem_type = f"🔒 {sem_type}"
477
+ sem_type = f"[PII] {sem_type}"
445
478
 
446
479
  col_table.add_row(
447
480
  col_name,
@@ -472,11 +505,11 @@ def _display_execution_result(result, verbose: bool = False) -> None:
472
505
 
473
506
  for check_result in result.results:
474
507
  if check_result.passed:
475
- status = "[green]PASS[/green]"
508
+ status = "[green]PASS[/green]"
476
509
  elif check_result.severity.value == "warning":
477
- status = "[yellow]WARN[/yellow]"
510
+ status = "[yellow]WARN[/yellow]"
478
511
  else:
479
- status = "[red]FAIL[/red]"
512
+ status = "[red]FAIL[/red]"
480
513
 
481
514
  col_str = f"[{check_result.column}] " if check_result.column else ""
482
515
  table.add_row(
@@ -490,10 +523,10 @@ def _display_execution_result(result, verbose: bool = False) -> None:
490
523
  # Summary
491
524
  console.print()
492
525
  if result.passed:
493
- console.print(f"[green]All {result.total_checks} checks passed[/green]")
526
+ console.print(f"[green]All {result.total_checks} checks passed[/green]")
494
527
  else:
495
528
  console.print(
496
- f"[red]{result.failed_count} failed[/red], "
529
+ f"[red]{result.failed_count} failed[/red], "
497
530
  f"[yellow]{result.warning_count} warnings[/yellow], "
498
531
  f"[green]{result.passed_count} passed[/green]"
499
532
  )
@@ -507,7 +540,7 @@ def _display_quick_results(results: list) -> None:
507
540
  table.add_column("Details")
508
541
 
509
542
  for check_name, passed, details, _ in results:
510
- status = "[green]PASS[/green]" if passed else "[red]FAIL[/red]"
543
+ status = "[green]PASS[/green]" if passed else "[red]FAIL[/red]"
511
544
  table.add_row(check_name, status, details)
512
545
 
513
546
  console.print(table)
@@ -534,8 +567,8 @@ def _display_discovery_results(analysis, ruleset) -> None:
534
567
  # PII warning
535
568
  if analysis.pii_columns:
536
569
  console.print(Panel(
537
- "[yellow]⚠️ PII Detected[/yellow]\n" +
538
- "\n".join(f" {col}" for col in analysis.pii_columns),
570
+ "[yellow]WARNING: PII Detected[/yellow]\n" +
571
+ "\n".join(f" - {col}" for col in analysis.pii_columns),
539
572
  border_style="yellow",
540
573
  ))
541
574
  console.print()
@@ -549,7 +582,7 @@ def _display_discovery_results(analysis, ruleset) -> None:
549
582
  for col in analysis.columns[:15]:
550
583
  sem = col.semantic_type.value
551
584
  if col.is_pii:
552
- sem = f"🔒 {sem}"
585
+ sem = f"[PII] {sem}"
553
586
 
554
587
  rules = ", ".join(col.suggested_validations[:3])
555
588
  if len(col.suggested_validations) > 3:
@@ -582,9 +615,9 @@ def _display_contract(contract) -> None:
582
615
  table.add_row(
583
616
  field_obj.name,
584
617
  type_str,
585
- "" if field_obj.required else "",
586
- "" if field_obj.unique else "",
587
- "🔒" if field_obj.pii else "",
618
+ "Y" if field_obj.required else "",
619
+ "Y" if field_obj.unique else "",
620
+ "[PII]" if field_obj.pii else "",
588
621
  )
589
622
 
590
623
  console.print(table)
@@ -593,14 +626,14 @@ def _display_contract(contract) -> None:
593
626
  if contract.quality:
594
627
  console.print("\n[bold]Quality SLA:[/bold]")
595
628
  if contract.quality.completeness:
596
- console.print(f" Completeness: {contract.quality.completeness}%")
629
+ console.print(f" - Completeness: {contract.quality.completeness}%")
597
630
  if contract.quality.row_count_min:
598
- console.print(f" Min rows: {contract.quality.row_count_min:,}")
631
+ console.print(f" - Min rows: {contract.quality.row_count_min:,}")
599
632
 
600
633
 
601
634
  def _display_contract_validation(result) -> None:
602
635
  """Display contract validation results."""
603
- status = "[green]PASSED[/green]" if result.passed else "[red]FAILED[/red]"
636
+ status = "[green]PASSED[/green]" if result.passed else "[red]FAILED[/red]"
604
637
  console.print(f"Contract: [bold]{result.contract.name}[/bold] v{result.contract.version}")
605
638
  console.print(f"Status: {status}\n")
606
639
 
@@ -627,7 +660,7 @@ def _display_contract_validation(result) -> None:
627
660
 
628
661
  def _display_contract_diff(diff) -> None:
629
662
  """Display contract diff."""
630
- console.print(f"[bold]Comparing contracts[/bold]")
663
+ console.print("[bold]Comparing contracts[/bold]")
631
664
  console.print(f" Old: v{diff.old_contract.version}")
632
665
  console.print(f" New: v{diff.new_contract.version}\n")
633
666
 
@@ -640,19 +673,19 @@ def _display_contract_diff(diff) -> None:
640
673
  if diff.breaking_changes:
641
674
  console.print("[red bold]Breaking Changes:[/red bold]")
642
675
  for change in diff.breaking_changes:
643
- console.print(f" {change.message}")
676
+ console.print(f" [red]X[/red] {change.message}")
644
677
  console.print()
645
678
 
646
679
  if diff.minor_changes:
647
680
  console.print("[yellow bold]Minor Changes:[/yellow bold]")
648
681
  for change in diff.minor_changes:
649
- console.print(f" ⚠️ {change.message}")
682
+ console.print(f" [yellow]![/yellow] {change.message}")
650
683
  console.print()
651
684
 
652
685
  if diff.non_breaking_changes:
653
686
  console.print("[dim]Non-breaking Changes:[/dim]")
654
687
  for change in diff.non_breaking_changes:
655
- console.print(f" {change.message}")
688
+ console.print(f" - {change.message}")
656
689
 
657
690
  console.print(f"\n[dim]Suggested version bump: {diff.suggest_version_bump()}[/dim]")
658
691
 
@@ -660,10 +693,10 @@ def _display_contract_diff(diff) -> None:
660
693
  def _display_anomaly_report(report) -> None:
661
694
  """Display anomaly detection report."""
662
695
  if not report.has_anomalies:
663
- console.print("[green]No anomalies detected[/green]")
696
+ console.print("[green]No anomalies detected[/green]")
664
697
  return
665
698
 
666
- console.print(f"[yellow bold]⚠️ {report.anomaly_count} anomalies detected[/yellow bold]\n")
699
+ console.print(f"[yellow bold]WARNING: {report.anomaly_count} anomalies detected[/yellow bold]\n")
667
700
 
668
701
  table = Table(title="Anomalies")
669
702
  table.add_column("Column", style="cyan")
@@ -702,5 +735,477 @@ def _save_results(output: str, dataset, results) -> None:
702
735
  Path(output).write_text(json.dumps(data, indent=2))
703
736
 
704
737
 
738
+ @app.command()
739
+ def history(
740
+ source: str | None = typer.Argument(None, help="Data source to query history for (optional)"),
741
+ last: str = typer.Option("30d", "--last", "-l", help="Time period: 7d, 30d, 90d"),
742
+ output_format: str = typer.Option("table", "--format", "-f", help="Output format: table, json"),
743
+ trend: bool = typer.Option(False, "--trend", "-t", help="Show quality trend analysis"),
744
+ db_path: str | None = typer.Option(None, "--db", help="Path to history database"),
745
+ ) -> None:
746
+ """
747
+ Query historical validation results.
748
+
749
+ Shows past validation runs and quality score trends over time.
750
+
751
+ [bold]Examples:[/bold]
752
+ duckguard history # Show all recent runs
753
+ duckguard history data.csv # Show runs for specific source
754
+ duckguard history data.csv --last 7d # Last 7 days
755
+ duckguard history data.csv --trend # Show trend analysis
756
+ duckguard history --format json # Output as JSON
757
+ """
758
+ import json as json_module
759
+
760
+ from duckguard.history import HistoryStorage, TrendAnalyzer
761
+
762
+ try:
763
+ storage = HistoryStorage(db_path=db_path)
764
+
765
+ # Parse time period
766
+ days = int(last.rstrip("d"))
767
+
768
+ if trend and source:
769
+ # Show trend analysis
770
+ console.print(f"\n[bold blue]DuckGuard[/bold blue] Trend Analysis: [cyan]{source}[/cyan]\n")
771
+
772
+ analyzer = TrendAnalyzer(storage)
773
+ analysis = analyzer.analyze(source, days=days)
774
+
775
+ if analysis.total_runs == 0:
776
+ console.print("[yellow]No historical data found for this source.[/yellow]")
777
+ console.print("[dim]Run some validations first, then check history.[/dim]")
778
+ return
779
+
780
+ # Display trend summary
781
+ trend_color = {
782
+ "improving": "green",
783
+ "declining": "red",
784
+ "stable": "yellow",
785
+ }.get(analysis.score_trend, "white")
786
+
787
+ trend_symbol = {
788
+ "improving": "[+]",
789
+ "declining": "[-]",
790
+ "stable": "[=]",
791
+ }.get(analysis.score_trend, "[=]")
792
+
793
+ console.print(Panel(
794
+ f"[bold]Quality Trend: [{trend_color}]{trend_symbol} {analysis.score_trend.upper()}[/{trend_color}][/bold]\n\n"
795
+ f"Current Score: [cyan]{analysis.current_score:.1f}%[/cyan]\n"
796
+ f"Average Score: [cyan]{analysis.average_score:.1f}%[/cyan]\n"
797
+ f"Min/Max: [dim]{analysis.min_score:.1f}% - {analysis.max_score:.1f}%[/dim]\n"
798
+ f"Change: [{trend_color}]{analysis.trend_change:+.1f}%[/{trend_color}]\n"
799
+ f"Total Runs: [cyan]{analysis.total_runs}[/cyan]\n"
800
+ f"Pass Rate: [cyan]{analysis.pass_rate:.1f}%[/cyan]",
801
+ title=f"Last {days} Days",
802
+ border_style=trend_color,
803
+ ))
804
+
805
+ if analysis.anomalies:
806
+ console.print(f"\n[yellow]Anomalies detected on: {', '.join(analysis.anomalies)}[/yellow]")
807
+
808
+ # Show daily data if available
809
+ if analysis.daily_data and len(analysis.daily_data) <= 14:
810
+ console.print()
811
+ table = Table(title="Daily Quality Scores")
812
+ table.add_column("Date", style="cyan")
813
+ table.add_column("Score", justify="right")
814
+ table.add_column("Runs", justify="right")
815
+ table.add_column("Pass Rate", justify="right")
816
+
817
+ for day in analysis.daily_data:
818
+ pass_rate = (day.passed_count / day.run_count * 100) if day.run_count > 0 else 0
819
+ score_style = "green" if day.avg_score >= 80 else "yellow" if day.avg_score >= 60 else "red"
820
+ table.add_row(
821
+ day.date,
822
+ f"[{score_style}]{day.avg_score:.1f}%[/{score_style}]",
823
+ str(day.run_count),
824
+ f"{pass_rate:.0f}%",
825
+ )
826
+
827
+ console.print(table)
828
+
829
+ else:
830
+ # Show run history
831
+ if source:
832
+ console.print(f"\n[bold blue]DuckGuard[/bold blue] History: [cyan]{source}[/cyan]\n")
833
+ runs = storage.get_runs(source, limit=20)
834
+ else:
835
+ console.print("\n[bold blue]DuckGuard[/bold blue] Recent Validation History\n")
836
+ runs = storage.get_runs(limit=20)
837
+
838
+ if not runs:
839
+ console.print("[yellow]No historical data found.[/yellow]")
840
+ console.print("[dim]Run some validations first, then check history.[/dim]")
841
+ return
842
+
843
+ if output_format == "json":
844
+ # JSON output
845
+ data = [
846
+ {
847
+ "run_id": run.run_id,
848
+ "source": run.source,
849
+ "started_at": run.started_at.isoformat(),
850
+ "quality_score": run.quality_score,
851
+ "passed": run.passed,
852
+ "total_checks": run.total_checks,
853
+ "passed_count": run.passed_count,
854
+ "failed_count": run.failed_count,
855
+ "warning_count": run.warning_count,
856
+ }
857
+ for run in runs
858
+ ]
859
+ console.print(json_module.dumps(data, indent=2))
860
+ else:
861
+ # Table output
862
+ table = Table(title=f"Validation Runs (Last {days} days)")
863
+ table.add_column("Date", style="cyan")
864
+ table.add_column("Source", style="dim", max_width=40)
865
+ table.add_column("Score", justify="right")
866
+ table.add_column("Status", justify="center")
867
+ table.add_column("Checks", justify="right")
868
+
869
+ for run in runs:
870
+ score_style = "green" if run.quality_score >= 80 else "yellow" if run.quality_score >= 60 else "red"
871
+ status = "[green]PASS[/green]" if run.passed else "[red]FAIL[/red]"
872
+
873
+ table.add_row(
874
+ run.started_at.strftime("%Y-%m-%d %H:%M"),
875
+ run.source[:40],
876
+ f"[{score_style}]{run.quality_score:.1f}%[/{score_style}]",
877
+ status,
878
+ f"{run.passed_count}/{run.total_checks}",
879
+ )
880
+
881
+ console.print(table)
882
+
883
+ # Show sources summary
884
+ sources = storage.get_sources()
885
+ if len(sources) > 1:
886
+ console.print(f"\n[dim]Tracked sources: {len(sources)}[/dim]")
887
+
888
+ except Exception as e:
889
+ console.print(f"[red]Error:[/red] {e}")
890
+ raise typer.Exit(1)
891
+
892
+
893
+ @app.command()
894
+ def report(
895
+ source: str = typer.Argument(..., help="Data source path or connection string"),
896
+ config: str | None = typer.Option(None, "--config", "-c", help="Path to duckguard.yaml rules file"),
897
+ table: str | None = typer.Option(None, "--table", "-t", help="Table name (for databases)"),
898
+ output_format: str = typer.Option("html", "--format", "-f", help="Output format: html, pdf"),
899
+ output: str = typer.Option("report.html", "--output", "-o", help="Output file path"),
900
+ title: str = typer.Option("DuckGuard Data Quality Report", "--title", help="Report title"),
901
+ include_passed: bool = typer.Option(True, "--include-passed/--no-passed", help="Include passed checks"),
902
+ store: bool = typer.Option(False, "--store", "-s", help="Store results in history"),
903
+ ) -> None:
904
+ """
905
+ Generate a data quality report (HTML or PDF).
906
+
907
+ Runs validation checks and generates a beautiful, shareable report.
908
+
909
+ [bold]Examples:[/bold]
910
+ duckguard report data.csv
911
+ duckguard report data.csv --format pdf --output report.pdf
912
+ duckguard report data.csv --config rules.yaml --title "Orders Quality"
913
+ duckguard report data.csv --store # Also save to history
914
+ """
915
+ from duckguard.connectors import connect
916
+ from duckguard.reports import generate_html_report, generate_pdf_report
917
+ from duckguard.rules import execute_rules, generate_rules, load_rules
918
+
919
+ # Determine output path based on format
920
+ if output == "report.html" and output_format == "pdf":
921
+ output = "report.pdf"
922
+
923
+ console.print(f"\n[bold blue]DuckGuard[/bold blue] Generating {output_format.upper()} report\n")
924
+
925
+ try:
926
+ with Progress(
927
+ SpinnerColumn(),
928
+ TextColumn("[progress.description]{task.description}"),
929
+ console=console,
930
+ transient=True,
931
+ ) as progress:
932
+ progress.add_task("Connecting to data source...", total=None)
933
+ dataset = connect(source, table=table)
934
+
935
+ console.print(f"[dim]Source: {source}[/dim]")
936
+ console.print(f"[dim]Rows: {dataset.row_count:,} | Columns: {dataset.column_count}[/dim]\n")
937
+
938
+ with Progress(
939
+ SpinnerColumn(),
940
+ TextColumn("[progress.description]{task.description}"),
941
+ console=console,
942
+ transient=True,
943
+ ) as progress:
944
+ progress.add_task("Running validation checks...", total=None)
945
+
946
+ if config:
947
+ ruleset = load_rules(config)
948
+ else:
949
+ ruleset = generate_rules(dataset, as_yaml=False)
950
+
951
+ result = execute_rules(ruleset, dataset=dataset)
952
+
953
+ # Store in history if requested
954
+ if store:
955
+ from duckguard.history import HistoryStorage
956
+
957
+ storage = HistoryStorage()
958
+ run_id = storage.store(result)
959
+ console.print(f"[dim]Stored in history: {run_id[:8]}...[/dim]\n")
960
+
961
+ # Display summary
962
+ status = "[green]PASSED[/green]" if result.passed else "[red]FAILED[/red]"
963
+ console.print(f"Validation: {status}")
964
+ console.print(f"Quality Score: [cyan]{result.quality_score:.1f}%[/cyan]")
965
+ console.print(f"Checks: {result.passed_count}/{result.total_checks} passed\n")
966
+
967
+ # Generate report
968
+ with Progress(
969
+ SpinnerColumn(),
970
+ TextColumn("[progress.description]{task.description}"),
971
+ console=console,
972
+ transient=True,
973
+ ) as progress:
974
+ progress.add_task(f"Generating {output_format.upper()} report...", total=None)
975
+
976
+ if output_format.lower() == "pdf":
977
+ generate_pdf_report(result, output, title=title, include_passed=include_passed)
978
+ else:
979
+ generate_html_report(result, output, title=title, include_passed=include_passed)
980
+
981
+ console.print(f"[green]SAVED[/green] Report saved to [cyan]{output}[/cyan]")
982
+ console.print("[dim]Open in browser to view the report[/dim]")
983
+
984
+ except ImportError as e:
985
+ if "weasyprint" in str(e).lower():
986
+ console.print("[red]Error:[/red] PDF generation requires weasyprint.")
987
+ console.print("[dim]Install with: pip install duckguard[reports][/dim]")
988
+ else:
989
+ console.print(f"[red]Error:[/red] {e}")
990
+ raise typer.Exit(1)
991
+ except Exception as e:
992
+ console.print(f"[red]Error:[/red] {e}")
993
+ raise typer.Exit(1)
994
+
995
+
996
@app.command()
def freshness(
    source: str = typer.Argument(..., help="Data source path"),
    column: str | None = typer.Option(None, "--column", "-c", help="Timestamp column to check"),
    max_age: str = typer.Option("24h", "--max-age", "-m", help="Maximum acceptable age: 1h, 6h, 24h, 7d"),
    output_format: str = typer.Option("table", "--format", "-f", help="Output format: table, json"),
) -> None:
    """
    Check data freshness.

    Monitors how recently data was updated using file modification time
    or timestamp columns.

    [bold]Examples:[/bold]
        duckguard freshness data.csv
        duckguard freshness data.csv --max-age 6h
        duckguard freshness data.csv --column updated_at
        duckguard freshness data.csv --format json
    """
    # Maintainer notes (kept out of the docstring because Typer renders the
    # docstring as the command's --help text):
    #   * exit status 1 is used both for stale data and for any error;
    #   * any --format value other than "json" falls through to the panel view.
    import json as json_module
    from pathlib import Path

    from duckguard.connectors import connect
    from duckguard.freshness import FreshnessMonitor
    from duckguard.freshness.monitor import parse_age_string

    console.print(f"\n[bold blue]DuckGuard[/bold blue] Checking freshness: [cyan]{source}[/cyan]\n")

    try:
        threshold = parse_age_string(max_age)
        monitor = FreshnessMonitor(threshold=threshold)

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
            transient=True,
        ) as progress:
            progress.add_task("Checking freshness...", total=None)

            if column:
                # An explicit timestamp column always wins over file metadata.
                dataset = connect(source)
                result = monitor.check_column_timestamp(dataset, column)
            elif Path(source).exists():
                # Local file: use its modification time.
                result = monitor.check_file_mtime(source)
            else:
                # Not a local path: connect and let the monitor pick a method.
                dataset = connect(source)
                result = monitor.check(dataset)

        if output_format == "json":
            console.print(json_module.dumps(result.to_dict(), indent=2))
        else:
            # Display table
            status_color = "green" if result.is_fresh else "red"
            status_text = "FRESH" if result.is_fresh else "STALE"
            if result.last_modified:
                last_modified_text = result.last_modified.strftime('%Y-%m-%d %H:%M:%S')
            else:
                last_modified_text = 'Unknown'

            console.print(Panel(
                f"[bold {status_color}]{status_text}[/bold {status_color}]\n\n"
                f"Last Modified: [cyan]{last_modified_text}[/cyan]\n"
                f"Age: [cyan]{result.age_human}[/cyan]\n"
                f"Threshold: [dim]{max_age}[/dim]\n"
                f"Method: [dim]{result.method.value}[/dim]",
                title="Freshness Check",
                border_style=status_color,
            ))

        if not result.is_fresh:
            raise typer.Exit(1)

    except typer.Exit:
        # typer.Exit derives from RuntimeError, so the generic handler below
        # would otherwise catch the deliberate "stale" exit and print a
        # misleading "Error: 1" line. Let it propagate untouched.
        raise
    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(1) from e
1070
+
1071
+
1072
@app.command()
def schema(
    source: str = typer.Argument(..., help="Data source path"),
    action: str = typer.Option("show", "--action", "-a", help="Action: show, capture, history, changes"),
    table: str | None = typer.Option(None, "--table", "-t", help="Table name (for databases)"),
    output_format: str = typer.Option("table", "--format", "-f", help="Output format: table, json"),
    limit: int = typer.Option(10, "--limit", "-l", help="Number of results to show"),
) -> None:
    """
    Track schema evolution over time.

    Captures schema snapshots and detects changes between versions.

    [bold]Actions:[/bold]
        show - Show current schema
        capture - Capture a schema snapshot
        history - Show schema snapshot history
        changes - Detect changes from last snapshot

    [bold]Examples:[/bold]
        duckguard schema data.csv # Show current schema
        duckguard schema data.csv --action capture # Capture snapshot
        duckguard schema data.csv --action history # View history
        duckguard schema data.csv --action changes # Detect changes
    """
    # Maintainer notes (kept out of the docstring because Typer renders the
    # docstring as the command's --help text):
    #   * exit status 1 is used for breaking changes, an unknown action, and
    #     any error;
    #   * --format is only consulted by the "history" action;
    #   * --limit is only consulted by the "history" action.
    import json as json_module

    from duckguard.connectors import connect
    from duckguard.schema_history import SchemaChangeAnalyzer, SchemaTracker

    console.print(f"\n[bold blue]DuckGuard[/bold blue] Schema: [cyan]{source}[/cyan]\n")

    try:
        dataset = connect(source, table=table)
        tracker = SchemaTracker()
        analyzer = SchemaChangeAnalyzer()

        if action == "show":
            # Display current schema
            col_table = Table(title="Current Schema")
            col_table.add_column("Column", style="cyan")
            col_table.add_column("Type", style="magenta")
            col_table.add_column("Position", justify="right")

            ref = dataset.engine.get_source_reference(dataset.source)
            result = dataset.engine.execute(f"DESCRIBE {ref}")

            # Position is the zero-based index of the row in the DESCRIBE output.
            for i, row in enumerate(result.fetchall()):
                col_table.add_row(row[0], row[1], str(i))

            console.print(col_table)
            console.print(f"\n[dim]Total columns: {dataset.column_count}[/dim]")

        elif action == "capture":
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                console=console,
                transient=True,
            ) as progress:
                progress.add_task("Capturing schema snapshot...", total=None)
                snapshot = tracker.capture(dataset)

            # The history view below tolerates a missing row count; do the
            # same here so a None value cannot break the ":," format spec.
            # NOTE(review): confirm whether capture() can actually yield None.
            row_text = f"{snapshot.row_count:,}" if snapshot.row_count is not None else "-"

            console.print(f"[green]CAPTURED[/green] Schema snapshot: [cyan]{snapshot.snapshot_id[:8]}...[/cyan]")
            console.print(f"[dim]Columns: {snapshot.column_count} | Rows: {row_text}[/dim]")
            console.print(f"[dim]Captured at: {snapshot.captured_at.strftime('%Y-%m-%d %H:%M:%S')}[/dim]")

        elif action == "history":
            history = tracker.get_history(source, limit=limit)

            if not history:
                console.print("[yellow]No schema history found for this source.[/yellow]")
                console.print("[dim]Use --action capture to create a snapshot first.[/dim]")
                return

            if output_format == "json":
                data = [s.to_dict() for s in history]
                console.print(json_module.dumps(data, indent=2))
            else:
                table_obj = Table(title="Schema History")
                table_obj.add_column("Snapshot ID", style="cyan")
                table_obj.add_column("Captured At", style="dim")
                table_obj.add_column("Columns", justify="right")
                table_obj.add_column("Rows", justify="right")

                for snapshot in history:
                    table_obj.add_row(
                        snapshot.snapshot_id[:8] + "...",
                        snapshot.captured_at.strftime("%Y-%m-%d %H:%M"),
                        str(snapshot.column_count),
                        f"{snapshot.row_count:,}" if snapshot.row_count else "-",
                    )

                console.print(table_obj)

        elif action == "changes":
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                console=console,
                transient=True,
            ) as progress:
                progress.add_task("Detecting schema changes...", total=None)
                report = analyzer.detect_changes(dataset)

            if not report.has_changes:
                console.print("[green]No schema changes detected[/green]")
                console.print(f"[dim]Snapshot captured: {report.current_snapshot.snapshot_id[:8]}...[/dim]")
                return

            # Display changes
            console.print(f"[yellow bold]{len(report.changes)} schema changes detected[/yellow bold]\n")

            if report.has_breaking_changes:
                console.print("[red bold]BREAKING CHANGES:[/red bold]")
                for change in report.breaking_changes:
                    console.print(f"  [red]X[/red] {change}")
                console.print()

            non_breaking = report.non_breaking_changes
            if non_breaking:
                console.print("[dim]Non-breaking changes:[/dim]")
                for change in non_breaking:
                    console.print(f"  - {change}")

            # Breaking changes fail the command so callers can gate on it.
            if report.has_breaking_changes:
                raise typer.Exit(1)

        else:
            console.print(f"[red]Error:[/red] Unknown action: {action}")
            console.print("[dim]Valid actions: show, capture, history, changes[/dim]")
            raise typer.Exit(1)

    except typer.Exit:
        # typer.Exit derives from RuntimeError, so the generic handler below
        # would otherwise catch the deliberate exits above and print a
        # misleading "Error: 1" line. Let them propagate untouched.
        raise
    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(1) from e
1208
+
1209
+
705
1210
  # Script entry point: run the CLI `app` when this module is executed directly.
  if __name__ == "__main__":
706
1211
  app()