duckguard 2.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. duckguard/__init__.py +1 -1
  2. duckguard/anomaly/__init__.py +28 -0
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/methods.py +16 -2
  5. duckguard/anomaly/ml_methods.py +724 -0
  6. duckguard/checks/__init__.py +26 -0
  7. duckguard/checks/conditional.py +796 -0
  8. duckguard/checks/distributional.py +524 -0
  9. duckguard/checks/multicolumn.py +726 -0
  10. duckguard/checks/query_based.py +643 -0
  11. duckguard/cli/main.py +257 -2
  12. duckguard/connectors/factory.py +30 -2
  13. duckguard/connectors/files.py +7 -3
  14. duckguard/core/column.py +851 -1
  15. duckguard/core/dataset.py +1035 -0
  16. duckguard/core/result.py +236 -0
  17. duckguard/freshness/__init__.py +33 -0
  18. duckguard/freshness/monitor.py +429 -0
  19. duckguard/history/schema.py +119 -1
  20. duckguard/notifications/__init__.py +20 -2
  21. duckguard/notifications/email.py +508 -0
  22. duckguard/profiler/distribution_analyzer.py +384 -0
  23. duckguard/profiler/outlier_detector.py +497 -0
  24. duckguard/profiler/pattern_matcher.py +301 -0
  25. duckguard/profiler/quality_scorer.py +445 -0
  26. duckguard/reports/html_reporter.py +1 -2
  27. duckguard/rules/executor.py +642 -0
  28. duckguard/rules/generator.py +4 -1
  29. duckguard/rules/schema.py +54 -0
  30. duckguard/schema_history/__init__.py +40 -0
  31. duckguard/schema_history/analyzer.py +414 -0
  32. duckguard/schema_history/tracker.py +288 -0
  33. duckguard/semantic/detector.py +17 -1
  34. duckguard-3.0.0.dist-info/METADATA +1072 -0
  35. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +38 -21
  36. duckguard-2.2.0.dist-info/METADATA +0 -351
  37. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
  38. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
  39. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
duckguard/cli/main.py CHANGED
@@ -338,17 +338,28 @@ def contract(
338
338
  def anomaly(
339
339
  source: str = typer.Argument(..., help="Path to file or connection string"),
340
340
  table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
341
- method: str = typer.Option("zscore", "--method", "-m", help="Detection method: zscore, iqr, percent_change"),
341
+ method: str = typer.Option("zscore", "--method", "-m", help="Method: zscore, iqr, percent_change, baseline, ks_test"),
342
342
  threshold: float | None = typer.Option(None, "--threshold", help="Detection threshold"),
343
343
  columns: list[str] | None = typer.Option(None, "--column", "-c", help="Specific columns to check"),
344
+ learn_baseline: bool = typer.Option(False, "--learn-baseline", "-L", help="Learn and store baseline from current data"),
344
345
  ) -> None:
345
346
  """
346
347
  Detect anomalies in data.
347
348
 
349
+ [bold]Methods:[/bold]
350
+ zscore - Z-score based detection (default)
351
+ iqr - Interquartile range detection
352
+ percent_change - Percent change from baseline
353
+ baseline - Compare to learned baseline (ML)
354
+ ks_test - Distribution drift detection (ML)
355
+
348
356
  [bold]Examples:[/bold]
349
357
  duckguard anomaly data.csv
350
358
  duckguard anomaly data.csv --method iqr --threshold 2.0
351
359
  duckguard anomaly data.csv --column amount --column quantity
360
+ duckguard anomaly data.csv --learn-baseline # Store baseline
361
+ duckguard anomaly data.csv --method baseline # Compare to baseline
362
+ duckguard anomaly data.csv --method ks_test # Detect drift
352
363
  """
353
364
  from duckguard.anomaly import detect_anomalies
354
365
  from duckguard.connectors import connect
@@ -362,8 +373,38 @@ def anomaly(
362
373
  console=console,
363
374
  transient=True,
364
375
  ) as progress:
365
- progress.add_task("Analyzing data...", total=None)
376
+ if learn_baseline:
377
+ progress.add_task("Learning baseline...", total=None)
378
+ else:
379
+ progress.add_task("Analyzing data...", total=None)
380
+
366
381
  dataset = connect(source, table=table)
382
+
383
+ # Handle baseline learning
384
+ if learn_baseline:
385
+ from duckguard.anomaly import BaselineMethod
386
+ from duckguard.history import HistoryStorage
387
+
388
+ storage = HistoryStorage()
389
+ baseline_method = BaselineMethod(storage=storage)
390
+
391
+ # Get numeric columns to learn baselines for
392
+ target_columns = columns if columns else dataset.columns
393
+ learned = 0
394
+
395
+ for col_name in target_columns:
396
+ col = dataset[col_name]
397
+ if col.mean is not None: # Numeric column
398
+ values = col.values
399
+ baseline_method.fit(values)
400
+ baseline_method.save_baseline(source, col_name)
401
+ learned += 1
402
+
403
+ console.print(f"[green]LEARNED[/green] Baselines stored for {learned} columns")
404
+ console.print("[dim]Use --method baseline to compare against stored baselines[/dim]")
405
+ return
406
+
407
+ # Regular anomaly detection
367
408
  report = detect_anomalies(
368
409
  dataset,
369
410
  method=method,
@@ -952,5 +993,219 @@ def report(
952
993
  raise typer.Exit(1)
953
994
 
954
995
 
996
+ @app.command()
997
+ def freshness(
998
+ source: str = typer.Argument(..., help="Data source path"),
999
+ column: str | None = typer.Option(None, "--column", "-c", help="Timestamp column to check"),
1000
+ max_age: str = typer.Option("24h", "--max-age", "-m", help="Maximum acceptable age: 1h, 6h, 24h, 7d"),
1001
+ output_format: str = typer.Option("table", "--format", "-f", help="Output format: table, json"),
1002
+ ) -> None:
1003
+ """
1004
+ Check data freshness.
1005
+
1006
+ Monitors how recently data was updated using file modification time
1007
+ or timestamp columns.
1008
+
1009
+ [bold]Examples:[/bold]
1010
+ duckguard freshness data.csv
1011
+ duckguard freshness data.csv --max-age 6h
1012
+ duckguard freshness data.csv --column updated_at
1013
+ duckguard freshness data.csv --format json
1014
+ """
1015
+ import json as json_module
1016
+
1017
+ from duckguard.connectors import connect
1018
+ from duckguard.freshness import FreshnessMonitor
1019
+ from duckguard.freshness.monitor import parse_age_string
1020
+
1021
+ console.print(f"\n[bold blue]DuckGuard[/bold blue] Checking freshness: [cyan]{source}[/cyan]\n")
1022
+
1023
+ try:
1024
+ threshold = parse_age_string(max_age)
1025
+ monitor = FreshnessMonitor(threshold=threshold)
1026
+
1027
+ with Progress(
1028
+ SpinnerColumn(),
1029
+ TextColumn("[progress.description]{task.description}"),
1030
+ console=console,
1031
+ transient=True,
1032
+ ) as progress:
1033
+ progress.add_task("Checking freshness...", total=None)
1034
+
1035
+ if column:
1036
+ dataset = connect(source)
1037
+ result = monitor.check_column_timestamp(dataset, column)
1038
+ else:
1039
+ # Try file mtime first, fallback to dataset
1040
+ from pathlib import Path
1041
+ if Path(source).exists():
1042
+ result = monitor.check_file_mtime(source)
1043
+ else:
1044
+ dataset = connect(source)
1045
+ result = monitor.check(dataset)
1046
+
1047
+ if output_format == "json":
1048
+ console.print(json_module.dumps(result.to_dict(), indent=2))
1049
+ else:
1050
+ # Display table
1051
+ status_color = "green" if result.is_fresh else "red"
1052
+ status_text = "FRESH" if result.is_fresh else "STALE"
1053
+
1054
+ console.print(Panel(
1055
+ f"[bold {status_color}]{status_text}[/bold {status_color}]\n\n"
1056
+ f"Last Modified: [cyan]{result.last_modified.strftime('%Y-%m-%d %H:%M:%S') if result.last_modified else 'Unknown'}[/cyan]\n"
1057
+ f"Age: [cyan]{result.age_human}[/cyan]\n"
1058
+ f"Threshold: [dim]{max_age}[/dim]\n"
1059
+ f"Method: [dim]{result.method.value}[/dim]",
1060
+ title="Freshness Check",
1061
+ border_style=status_color,
1062
+ ))
1063
+
1064
+ if not result.is_fresh:
1065
+ raise typer.Exit(1)
1066
+
1067
+ except Exception as e:
1068
+ console.print(f"[red]Error:[/red] {e}")
1069
+ raise typer.Exit(1)
1070
+
1071
+
1072
+ @app.command()
1073
+ def schema(
1074
+ source: str = typer.Argument(..., help="Data source path"),
1075
+ action: str = typer.Option("show", "--action", "-a", help="Action: show, capture, history, changes"),
1076
+ table: str | None = typer.Option(None, "--table", "-t", help="Table name (for databases)"),
1077
+ output_format: str = typer.Option("table", "--format", "-f", help="Output format: table, json"),
1078
+ limit: int = typer.Option(10, "--limit", "-l", help="Number of results to show"),
1079
+ ) -> None:
1080
+ """
1081
+ Track schema evolution over time.
1082
+
1083
+ Captures schema snapshots and detects changes between versions.
1084
+
1085
+ [bold]Actions:[/bold]
1086
+ show - Show current schema
1087
+ capture - Capture a schema snapshot
1088
+ history - Show schema snapshot history
1089
+ changes - Detect changes from last snapshot
1090
+
1091
+ [bold]Examples:[/bold]
1092
+ duckguard schema data.csv # Show current schema
1093
+ duckguard schema data.csv --action capture # Capture snapshot
1094
+ duckguard schema data.csv --action history # View history
1095
+ duckguard schema data.csv --action changes # Detect changes
1096
+ """
1097
+ import json as json_module
1098
+
1099
+ from duckguard.connectors import connect
1100
+ from duckguard.schema_history import SchemaChangeAnalyzer, SchemaTracker
1101
+
1102
+ console.print(f"\n[bold blue]DuckGuard[/bold blue] Schema: [cyan]{source}[/cyan]\n")
1103
+
1104
+ try:
1105
+ dataset = connect(source, table=table)
1106
+ tracker = SchemaTracker()
1107
+ analyzer = SchemaChangeAnalyzer()
1108
+
1109
+ if action == "show":
1110
+ # Display current schema
1111
+ col_table = Table(title="Current Schema")
1112
+ col_table.add_column("Column", style="cyan")
1113
+ col_table.add_column("Type", style="magenta")
1114
+ col_table.add_column("Position", justify="right")
1115
+
1116
+ ref = dataset.engine.get_source_reference(dataset.source)
1117
+ result = dataset.engine.execute(f"DESCRIBE {ref}")
1118
+
1119
+ for i, row in enumerate(result.fetchall()):
1120
+ col_table.add_row(row[0], row[1], str(i))
1121
+
1122
+ console.print(col_table)
1123
+ console.print(f"\n[dim]Total columns: {dataset.column_count}[/dim]")
1124
+
1125
+ elif action == "capture":
1126
+ with Progress(
1127
+ SpinnerColumn(),
1128
+ TextColumn("[progress.description]{task.description}"),
1129
+ console=console,
1130
+ transient=True,
1131
+ ) as progress:
1132
+ progress.add_task("Capturing schema snapshot...", total=None)
1133
+ snapshot = tracker.capture(dataset)
1134
+
1135
+ console.print(f"[green]CAPTURED[/green] Schema snapshot: [cyan]{snapshot.snapshot_id[:8]}...[/cyan]")
1136
+ console.print(f"[dim]Columns: {snapshot.column_count} | Rows: {snapshot.row_count:,}[/dim]")
1137
+ console.print(f"[dim]Captured at: {snapshot.captured_at.strftime('%Y-%m-%d %H:%M:%S')}[/dim]")
1138
+
1139
+ elif action == "history":
1140
+ history = tracker.get_history(source, limit=limit)
1141
+
1142
+ if not history:
1143
+ console.print("[yellow]No schema history found for this source.[/yellow]")
1144
+ console.print("[dim]Use --action capture to create a snapshot first.[/dim]")
1145
+ return
1146
+
1147
+ if output_format == "json":
1148
+ data = [s.to_dict() for s in history]
1149
+ console.print(json_module.dumps(data, indent=2))
1150
+ else:
1151
+ table_obj = Table(title="Schema History")
1152
+ table_obj.add_column("Snapshot ID", style="cyan")
1153
+ table_obj.add_column("Captured At", style="dim")
1154
+ table_obj.add_column("Columns", justify="right")
1155
+ table_obj.add_column("Rows", justify="right")
1156
+
1157
+ for snapshot in history:
1158
+ table_obj.add_row(
1159
+ snapshot.snapshot_id[:8] + "...",
1160
+ snapshot.captured_at.strftime("%Y-%m-%d %H:%M"),
1161
+ str(snapshot.column_count),
1162
+ f"{snapshot.row_count:,}" if snapshot.row_count else "-",
1163
+ )
1164
+
1165
+ console.print(table_obj)
1166
+
1167
+ elif action == "changes":
1168
+ with Progress(
1169
+ SpinnerColumn(),
1170
+ TextColumn("[progress.description]{task.description}"),
1171
+ console=console,
1172
+ transient=True,
1173
+ ) as progress:
1174
+ progress.add_task("Detecting schema changes...", total=None)
1175
+ report = analyzer.detect_changes(dataset)
1176
+
1177
+ if not report.has_changes:
1178
+ console.print("[green]No schema changes detected[/green]")
1179
+ console.print(f"[dim]Snapshot captured: {report.current_snapshot.snapshot_id[:8]}...[/dim]")
1180
+ return
1181
+
1182
+ # Display changes
1183
+ console.print(f"[yellow bold]{len(report.changes)} schema changes detected[/yellow bold]\n")
1184
+
1185
+ if report.has_breaking_changes:
1186
+ console.print("[red bold]BREAKING CHANGES:[/red bold]")
1187
+ for change in report.breaking_changes:
1188
+ console.print(f" [red]X[/red] {change}")
1189
+ console.print()
1190
+
1191
+ non_breaking = report.non_breaking_changes
1192
+ if non_breaking:
1193
+ console.print("[dim]Non-breaking changes:[/dim]")
1194
+ for change in non_breaking:
1195
+ console.print(f" - {change}")
1196
+
1197
+ if report.has_breaking_changes:
1198
+ raise typer.Exit(1)
1199
+
1200
+ else:
1201
+ console.print(f"[red]Error:[/red] Unknown action: {action}")
1202
+ console.print("[dim]Valid actions: show, capture, history, changes[/dim]")
1203
+ raise typer.Exit(1)
1204
+
1205
+ except Exception as e:
1206
+ console.print(f"[red]Error:[/red] {e}")
1207
+ raise typer.Exit(1)
1208
+
1209
+
955
1210
  if __name__ == "__main__":
956
1211
  app()
duckguard/connectors/factory.py CHANGED
@@ -31,7 +31,7 @@ def register_connector(connector_class: type[Connector]) -> None:
31
31
 
32
32
 
33
33
  def connect(
34
- source: str,
34
+ source: Any,
35
35
  *,
36
36
  table: str | None = None,
37
37
  schema: str | None = None,
@@ -46,7 +46,7 @@ def connect(
46
46
  It automatically detects the source type and uses the appropriate connector.
47
47
 
48
48
  Args:
49
- source: Path to file, connection string, or URL
49
+ source: Path to file, connection string, URL, or DataFrame (pandas/polars/pyarrow)
50
50
  table: Table name (for database connections)
51
51
  schema: Schema name (for database connections)
52
52
  database: Database name (for database connections)
@@ -60,6 +60,9 @@ def connect(
60
60
  # Connect to a CSV file
61
61
  orders = connect("data/orders.csv")
62
62
 
63
+ # Connect to a DataFrame
64
+ orders = connect(df)
65
+
63
66
  # Connect to a Parquet file on S3
64
67
  orders = connect("s3://bucket/orders.parquet")
65
68
 
@@ -72,6 +75,23 @@ def connect(
72
75
  Raises:
73
76
  ValueError: If no connector can handle the source
74
77
  """
78
+ # Handle DataFrame sources (pandas, polars, pyarrow)
79
+ if not isinstance(source, str):
80
+ # Check if it's a DataFrame-like object
81
+ if hasattr(source, '__dataframe__') or hasattr(source, 'to_pandas') or \
82
+ (hasattr(source, 'shape') and hasattr(source, 'columns')):
83
+ # Register DataFrame with engine
84
+ if engine is None:
85
+ engine = DuckGuardEngine.get_instance()
86
+
87
+ # Generate a unique name for the DataFrame
88
+ import hashlib
89
+ import time
90
+ df_name = f"df_{hashlib.md5(str(time.time()).encode()).hexdigest()[:8]}"
91
+
92
+ engine.register_dataframe(df_name, source)
93
+ return Dataset(source=df_name, engine=engine, name="dataframe")
94
+
75
95
  config = ConnectionConfig(
76
96
  source=source,
77
97
  table=table,
@@ -99,6 +119,10 @@ def connect(
99
119
 
100
120
  def _is_database_connection(source: str) -> bool:
101
121
  """Check if source is a database connection string."""
122
+ # Only handle string sources
123
+ if not isinstance(source, str):
124
+ return False
125
+
102
126
  db_prefixes = (
103
127
  "postgres://",
104
128
  "postgresql://",
@@ -143,6 +167,10 @@ def _handle_database_connection(
143
167
  engine: DuckGuardEngine | None,
144
168
  ) -> Dataset:
145
169
  """Handle database connection strings."""
170
+ # Validate source is a string
171
+ if not isinstance(source, str):
172
+ raise ValueError(f"Expected string source, got {type(source).__name__}")
173
+
146
174
  source_lower = source.lower()
147
175
 
148
176
  # PostgreSQL
duckguard/connectors/files.py CHANGED
@@ -65,6 +65,10 @@ class FileConnector(Connector):
65
65
  @classmethod
66
66
  def can_handle(cls, source: str) -> bool:
67
67
  """Check if this connector can handle the source."""
68
+ # Only handle string paths
69
+ if not isinstance(source, str):
70
+ return False
71
+
68
72
  # Check for file extensions
69
73
  path = Path(source)
70
74
  ext = path.suffix.lower()
@@ -99,7 +103,7 @@ class S3Connector(FileConnector):
99
103
  @classmethod
100
104
  def can_handle(cls, source: str) -> bool:
101
105
  """Check if this is an S3 path."""
102
- return source.lower().startswith("s3://")
106
+ return isinstance(source, str) and source.lower().startswith("s3://")
103
107
 
104
108
  @classmethod
105
109
  def get_priority(cls) -> int:
@@ -113,7 +117,7 @@ class GCSConnector(FileConnector):
113
117
  @classmethod
114
118
  def can_handle(cls, source: str) -> bool:
115
119
  """Check if this is a GCS path."""
116
- return source.lower().startswith(("gs://", "gcs://"))
120
+ return isinstance(source, str) and source.lower().startswith(("gs://", "gcs://"))
117
121
 
118
122
  @classmethod
119
123
  def get_priority(cls) -> int:
@@ -127,7 +131,7 @@ class AzureConnector(FileConnector):
127
131
  @classmethod
128
132
  def can_handle(cls, source: str) -> bool:
129
133
  """Check if this is an Azure path."""
130
- return source.lower().startswith(("az://", "abfs://"))
134
+ return isinstance(source, str) and source.lower().startswith(("az://", "abfs://"))
131
135
 
132
136
  @classmethod
133
137
  def get_priority(cls) -> int: