duckguard 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/cli/main.py
ADDED
|
@@ -0,0 +1,706 @@
|
|
|
1
|
+
"""DuckGuard CLI - Command line interface for data quality validation.
|
|
2
|
+
|
|
3
|
+
A modern, beautiful CLI for data quality that just works.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
import typer
|
|
13
|
+
from rich.console import Console
|
|
14
|
+
from rich.panel import Panel
|
|
15
|
+
from rich.table import Table
|
|
16
|
+
from rich.syntax import Syntax
|
|
17
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
18
|
+
from rich import print as rprint
|
|
19
|
+
from rich.tree import Tree
|
|
20
|
+
from rich.text import Text
|
|
21
|
+
from rich.columns import Columns
|
|
22
|
+
from rich.markdown import Markdown
|
|
23
|
+
|
|
24
|
+
from duckguard import __version__
|
|
25
|
+
|
|
26
|
+
app = typer.Typer(
|
|
27
|
+
name="duckguard",
|
|
28
|
+
help="DuckGuard - Data quality that just works. Fast, simple, Pythonic.",
|
|
29
|
+
add_completion=False,
|
|
30
|
+
rich_markup_mode="rich",
|
|
31
|
+
)
|
|
32
|
+
console = Console()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def version_callback(value: bool) -> None:
    """Eager callback for --version: print a banner panel and exit."""
    if not value:
        return
    banner = (
        f"[bold blue]DuckGuard[/bold blue] v{__version__}\n"
        "[dim]The fast, simple data quality tool[/dim]"
    )
    console.print(Panel(banner, border_style="blue"))
    raise typer.Exit()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@app.callback()
def main(
    version: Optional[bool] = typer.Option(
        None,
        "--version",
        "-v",
        callback=version_callback,
        is_eager=True,  # runs before any subcommand is resolved
        help="Show version and exit.",
    ),
) -> None:
    """DuckGuard - Data quality made clear."""
    # Intentionally empty: this callback exists only to attach the
    # global --version option to the Typer app.
    pass
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@app.command()
def check(
    source: str = typer.Argument(..., help="Path to file or connection string"),
    config: Optional[str] = typer.Option(None, "--config", "-c", help="Path to duckguard.yaml rules file"),
    table: Optional[str] = typer.Option(None, "--table", "-t", help="Table name (for databases)"),
    not_null: Optional[list[str]] = typer.Option(None, "--not-null", "-n", help="Columns that must not be null"),
    unique: Optional[list[str]] = typer.Option(None, "--unique", "-u", help="Columns that must be unique"),
    output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file (json)"),
    verbose: bool = typer.Option(False, "--verbose", "-V", help="Verbose output"),
) -> None:
    """
    Run data quality checks on a data source.

    [bold]Examples:[/bold]
        duckguard check data.csv
        duckguard check data.csv --not-null id --unique email
        duckguard check data.csv --config duckguard.yaml
        duckguard check postgres://localhost/db --table orders
    """
    # Imported lazily so `duckguard --help` stays fast.
    from duckguard.connectors import connect
    from duckguard.rules import load_rules, execute_rules
    from duckguard.core.scoring import score

    console.print(f"\n[bold blue]DuckGuard[/bold blue] Checking: [cyan]{source}[/cyan]\n")

    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
            transient=True,
        ) as progress:
            progress.add_task("Connecting to data source...", total=None)
            dataset = connect(source, table=table)

        # Display basic info
        info_table = Table(show_header=False, box=None, padding=(0, 2))
        info_table.add_column("", style="dim")
        info_table.add_column("")
        info_table.add_row("Rows", f"[green]{dataset.row_count:,}[/green]")
        info_table.add_row("Columns", f"[green]{dataset.column_count}[/green]")
        console.print(info_table)
        console.print()

        # Execute checks
        if config:
            # YAML rule file supplied: run the full rule engine.
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                console=console,
                transient=True,
            ) as progress:
                progress.add_task("Running checks...", total=None)
                ruleset = load_rules(config)
                result = execute_rules(ruleset, dataset=dataset)

            _display_execution_result(result, verbose)

        else:
            # Quick checks built from CLI flags.
            # Each entry is a (name, passed, details, column) tuple.
            results = []

            # Row count check
            results.append(("Row count > 0", dataset.row_count > 0, f"{dataset.row_count:,} rows", None))

            # Not null checks
            if not_null:
                for col_name in not_null:
                    if col_name in dataset.columns:
                        col = dataset[col_name]
                        passed = col.null_count == 0
                        results.append((
                            f"{col_name} not null",
                            passed,
                            f"{col.null_count:,} nulls ({col.null_percent:.1f}%)",
                            col_name
                        ))
                    else:
                        results.append((f"{col_name} not null", False, "Column not found", col_name))

            # Unique checks
            if unique:
                for col_name in unique:
                    if col_name in dataset.columns:
                        col = dataset[col_name]
                        passed = col.unique_percent == 100
                        dup_count = col.total_count - col.unique_count
                        results.append((
                            f"{col_name} unique",
                            passed,
                            f"{col.unique_percent:.1f}% unique ({dup_count:,} duplicates)",
                            col_name
                        ))
                    else:
                        results.append((f"{col_name} unique", False, "Column not found", col_name))

            _display_quick_results(results)

        # Calculate quality score
        quality = score(dataset)
        _display_quality_score(quality)

        # Output to file (quick-check results only exist in the no-config path)
        if output:
            _save_results(output, dataset, results if not config else None)
            console.print(f"\n[dim]Results saved to {output}[/dim]")

        # Exit non-zero for CI pipelines when any check failed.
        if config and not result.passed:
            raise typer.Exit(1)
        elif not config and not all(r[1] for r in results):
            raise typer.Exit(1)

    except typer.Exit:
        # BUG FIX: typer.Exit subclasses RuntimeError, so the blanket
        # `except Exception` below used to intercept the deliberate
        # `raise typer.Exit(1)` above and print a spurious "Error: 1".
        # Let exit requests propagate untouched.
        raise
    except FileNotFoundError as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(1)
    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        if verbose:
            console.print_exception()
        raise typer.Exit(1)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
@app.command()
def discover(
    source: str = typer.Argument(..., help="Path to file or connection string"),
    table: Optional[str] = typer.Option(None, "--table", "-t", help="Table name"),
    output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file for rules (duckguard.yaml)"),
    format: str = typer.Option("yaml", "--format", "-f", help="Output format: yaml, python"),
) -> None:
    """
    Discover data and auto-generate validation rules.

    Analyzes your data and suggests appropriate validation rules.

    [bold]Examples:[/bold]
        duckguard discover data.csv
        duckguard discover data.csv --output duckguard.yaml
        duckguard discover postgres://localhost/db --table users
    """
    # NOTE(review): `format` is accepted but never consulted below — only
    # YAML is ever emitted. The option is kept for CLI compatibility; either
    # wire it up or drop it.
    from duckguard.connectors import connect
    from duckguard.rules import generate_rules
    from duckguard.rules.generator import ruleset_to_yaml
    from duckguard.semantic import SemanticAnalyzer

    console.print(f"\n[bold blue]DuckGuard[/bold blue] Discovering: [cyan]{source}[/cyan]\n")

    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
            transient=True,
        ) as progress:
            # Task handle not needed: the spinner is transient and never updated.
            progress.add_task("Analyzing data...", total=None)
            dataset = connect(source, table=table)

            # Semantic analysis (semantic types, PII detection)
            analyzer = SemanticAnalyzer()
            analysis = analyzer.analyze(dataset)

            # Generate rules (as RuleSet object, not YAML string)
            ruleset = generate_rules(dataset, as_yaml=False)

        # Display discovery results
        _display_discovery_results(analysis, ruleset)

        # Serialize once; used by both the save and the preview paths.
        yaml_content = ruleset_to_yaml(ruleset)
        if output:
            Path(output).write_text(yaml_content, encoding="utf-8")
            console.print(f"\n[green]✓[/green] Rules saved to [cyan]{output}[/cyan]")
            console.print(f"[dim]Run: duckguard check {source} --config {output}[/dim]")
        else:
            # No output file: show the generated YAML inline.
            console.print(Panel(
                Syntax(yaml_content, "yaml", theme="monokai"),
                title="Generated Rules (duckguard.yaml)",
                border_style="green"
            ))

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(1)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
@app.command()
def contract(
    action: str = typer.Argument(..., help="Action: generate, validate, diff"),
    source: str = typer.Argument(None, help="Data source or contract file"),
    contract_file: Optional[str] = typer.Option(None, "--contract", "-c", help="Contract file path"),
    output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file"),
    strict: bool = typer.Option(False, "--strict", help="Strict validation mode"),
) -> None:
    """
    Manage data contracts.

    [bold]Actions:[/bold]
        generate - Generate a contract from data
        validate - Validate data against a contract
        diff - Compare two contract versions

    [bold]Examples:[/bold]
        duckguard contract generate data.csv --output orders.contract.yaml
        duckguard contract validate data.csv --contract orders.contract.yaml
        duckguard contract diff old.contract.yaml new.contract.yaml
    """
    from duckguard.contracts import (
        load_contract,
        validate_contract,
        generate_contract,
        diff_contracts,
    )
    from duckguard.contracts.loader import contract_to_yaml

    try:
        if action == "generate":
            if not source:
                console.print("[red]Error:[/red] Source required for generate")
                raise typer.Exit(1)

            console.print(f"\n[bold blue]DuckGuard[/bold blue] Generating contract for: [cyan]{source}[/cyan]\n")

            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                console=console,
                transient=True,
            ) as progress:
                progress.add_task("Analyzing data...", total=None)
                contract_obj = generate_contract(source)

            _display_contract(contract_obj)

            if output:
                yaml_content = contract_to_yaml(contract_obj)
                Path(output).write_text(yaml_content, encoding="utf-8")
                console.print(f"\n[green]✓[/green] Contract saved to [cyan]{output}[/cyan]")

        elif action == "validate":
            if not source or not contract_file:
                console.print("[red]Error:[/red] Both source and --contract required for validate")
                raise typer.Exit(1)

            console.print("\n[bold blue]DuckGuard[/bold blue] Validating against contract\n")

            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                console=console,
                transient=True,
            ) as progress:
                progress.add_task("Validating...", total=None)
                contract_obj = load_contract(contract_file)
                result = validate_contract(contract_obj, source, strict_mode=strict)

            _display_contract_validation(result)

            # Non-zero exit for CI when the contract is violated.
            if not result.passed:
                raise typer.Exit(1)

        elif action == "diff":
            # For diff, `source` is the old contract and --contract the new one.
            if not source or not contract_file:
                console.print("[red]Error:[/red] Two contract files required for diff")
                raise typer.Exit(1)

            old_contract = load_contract(source)
            new_contract = load_contract(contract_file)

            diff_result = diff_contracts(old_contract, new_contract)
            _display_contract_diff(diff_result)

        else:
            console.print(f"[red]Error:[/red] Unknown action: {action}")
            raise typer.Exit(1)

    except typer.Exit:
        # BUG FIX: typer.Exit subclasses RuntimeError, so the blanket handler
        # below used to catch the intentional `raise typer.Exit(1)` calls above
        # and print a spurious "Error: 1". Re-raise exit requests untouched.
        raise
    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(1)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
@app.command()
def anomaly(
    source: str = typer.Argument(..., help="Path to file or connection string"),
    table: Optional[str] = typer.Option(None, "--table", "-t", help="Table name"),
    method: str = typer.Option("zscore", "--method", "-m", help="Detection method: zscore, iqr, percent_change"),
    threshold: Optional[float] = typer.Option(None, "--threshold", help="Detection threshold"),
    columns: Optional[list[str]] = typer.Option(None, "--column", "-c", help="Specific columns to check"),
) -> None:
    """
    Detect anomalies in data.

    [bold]Examples:[/bold]
        duckguard anomaly data.csv
        duckguard anomaly data.csv --method iqr --threshold 2.0
        duckguard anomaly data.csv --column amount --column quantity
    """
    # Imported lazily so `duckguard --help` stays fast.
    from duckguard.connectors import connect
    from duckguard.anomaly import detect_anomalies

    console.print(f"\n[bold blue]DuckGuard[/bold blue] Detecting anomalies in: [cyan]{source}[/cyan]\n")

    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
            transient=True,
        ) as progress:
            progress.add_task("Analyzing data...", total=None)
            dataset = connect(source, table=table)
            report = detect_anomalies(
                dataset,
                method=method,
                threshold=threshold,
                columns=columns,
            )

        _display_anomaly_report(report)

        # Non-zero exit for CI pipelines when anomalies were found.
        if report.has_anomalies:
            raise typer.Exit(1)

    except typer.Exit:
        # BUG FIX: typer.Exit subclasses RuntimeError, so the handler below
        # used to catch the `raise typer.Exit(1)` above and print a spurious
        # "Error: 1". Re-raise exit requests untouched.
        raise
    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(1)
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
@app.command()
def info(
    source: str = typer.Argument(..., help="Path to file or connection string"),
    table: Optional[str] = typer.Option(None, "--table", "-t", help="Table name"),
) -> None:
    """
    Display information about a data source.

    [bold]Examples:[/bold]
        duckguard info data.csv
        duckguard info postgres://localhost/db --table users
    """
    # Imported lazily so `duckguard --help` stays fast.
    from duckguard.connectors import connect
    from duckguard.semantic import SemanticAnalyzer

    try:
        dataset = connect(source, table=table)
        analyzer = SemanticAnalyzer()

        console.print(Panel(
            f"[bold]{dataset.name}[/bold]",
            border_style="blue"
        ))

        # Basic info
        info_table = Table(show_header=False, box=None)
        info_table.add_column("Property", style="cyan")
        info_table.add_column("Value", style="green")

        info_table.add_row("Source", source)
        info_table.add_row("Rows", f"{dataset.row_count:,}")
        info_table.add_row("Columns", str(dataset.column_count))

        console.print(info_table)
        console.print()

        # Column details — capped at the first 20 columns to keep output readable.
        col_table = Table(title="Columns")
        col_table.add_column("Name", style="cyan")
        col_table.add_column("Type", style="magenta")
        col_table.add_column("Nulls", justify="right")
        col_table.add_column("Unique", justify="right")
        col_table.add_column("Semantic", style="yellow")

        for col_name in dataset.columns[:20]:
            col = dataset[col_name]
            col_analysis = analyzer.analyze_column(dataset, col_name)

            sem_type = col_analysis.semantic_type.value
            if sem_type == "unknown":
                # Show a dash instead of the literal "unknown".
                sem_type = "-"
            if col_analysis.is_pii:
                # Lock icon flags columns the analyzer detected as PII.
                sem_type = f"🔒 {sem_type}"

            col_table.add_row(
                col_name,
                # Coarse type display: "numeric" iff the column has a mean.
                "numeric" if col.mean is not None else "string",
                f"{col.null_percent:.1f}%",
                f"{col.unique_percent:.1f}%",
                sem_type,
            )

        if dataset.column_count > 20:
            col_table.add_row(f"... and {dataset.column_count - 20} more", "", "", "", "")

        console.print(col_table)

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(1)
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
# Helper display functions
|
|
465
|
+
|
|
466
|
+
def _display_execution_result(result, verbose: bool = False) -> None:
    """Render per-check outcomes of a YAML rule run, then a pass/fail summary."""
    report = Table(title="Validation Results")
    report.add_column("Check", style="cyan")
    report.add_column("Status", justify="center")
    report.add_column("Details")

    for item in result.results:
        # Passed beats everything; otherwise warnings render amber, the rest red.
        if item.passed:
            badge = "[green]✓ PASS[/green]"
        else:
            badge = (
                "[yellow]⚠ WARN[/yellow]"
                if item.severity.value == "warning"
                else "[red]✗ FAIL[/red]"
            )

        prefix = f"[{item.column}] " if item.column else ""
        report.add_row(f"{prefix}{item.check.type.value}", badge, item.message[:60])

    console.print(report)

    # Summary line
    console.print()
    if result.passed:
        console.print(f"[green]✓ All {result.total_checks} checks passed[/green]")
    else:
        summary = (
            f"[red]✗ {result.failed_count} failed[/red], "
            f"[yellow]{result.warning_count} warnings[/yellow], "
            f"[green]{result.passed_count} passed[/green]"
        )
        console.print(summary)
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def _display_quick_results(results: list) -> None:
    """Render quick-check tuples (name, passed, details, column) as a table."""
    report = Table()
    report.add_column("Check", style="cyan")
    report.add_column("Status", justify="center")
    report.add_column("Details")

    for name, ok, details, _column in results:
        badge = "[green]✓ PASS[/green]" if ok else "[red]✗ FAIL[/red]"
        report.add_row(name, badge, details)

    console.print(report)
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
def _display_quality_score(quality) -> None:
    """Render the overall quality score and letter grade in a colored panel."""
    palette = {"A": "green", "B": "blue", "C": "yellow", "D": "orange1", "F": "red"}
    color = palette.get(quality.grade, "white")

    message = (
        f"[bold]Quality Score: [{color}]{quality.overall:.0f}/100[/{color}] "
        f"(Grade: [{color}]{quality.grade}[/{color}])[/bold]"
    )
    console.print()
    console.print(Panel(message, border_style=color))
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def _display_discovery_results(analysis, ruleset) -> None:
    """Render semantic-analysis findings and the count of generated rules."""
    console.print(f"[bold]Discovered {analysis.column_count} columns[/bold]\n")

    # PII warning panel, only when the analyzer flagged any columns.
    if analysis.pii_columns:
        listing = "\n".join(f" • {col}" for col in analysis.pii_columns)
        console.print(Panel(
            "[yellow]⚠️ PII Detected[/yellow]\n" + listing,
            border_style="yellow",
        ))
        console.print()

    # Per-column analysis, capped at the first 15 columns.
    report = Table(title="Column Analysis")
    report.add_column("Column", style="cyan")
    report.add_column("Semantic Type", style="magenta")
    report.add_column("Suggested Rules")

    for col in analysis.columns[:15]:
        label = col.semantic_type.value
        if col.is_pii:
            label = f"🔒 {label}"

        suggestions = col.suggested_validations
        rules = ", ".join(suggestions[:3])
        if len(suggestions) > 3:
            rules += f" (+{len(suggestions) - 3})"

        report.add_row(col.name, label, rules or "-")

    if len(analysis.columns) > 15:
        report.add_row(f"... and {len(analysis.columns) - 15} more", "", "")

    console.print(report)
    console.print()
    console.print(f"[dim]Generated {ruleset.total_checks} validation rules[/dim]")
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def _display_contract(contract) -> None:
    """Display contract details."""
    console.print(f"[bold]Contract: {contract.name}[/bold] v{contract.version}\n")

    # Schema table — capped at the first 15 fields.
    table = Table(title="Schema")
    table.add_column("Field", style="cyan")
    table.add_column("Type", style="magenta")
    table.add_column("Required")
    table.add_column("Unique")
    table.add_column("PII")

    for field_obj in contract.schema[:15]:
        # Field types may be enum members or plain strings depending on how
        # the contract was produced; normalize to a display string.
        type_str = field_obj.type.value if hasattr(field_obj.type, 'value') else str(field_obj.type)
        table.add_row(
            field_obj.name,
            type_str,
            "✓" if field_obj.required else "",
            "✓" if field_obj.unique else "",
            "🔒" if field_obj.pii else "",
        )

    console.print(table)

    # Quality SLA — each metric is printed only when set on the contract.
    if contract.quality:
        console.print("\n[bold]Quality SLA:[/bold]")
        if contract.quality.completeness:
            console.print(f" • Completeness: {contract.quality.completeness}%")
        if contract.quality.row_count_min:
            console.print(f" • Min rows: {contract.quality.row_count_min:,}")
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def _display_contract_validation(result) -> None:
    """Display contract validation results."""
    status = "[green]✓ PASSED[/green]" if result.passed else "[red]✗ FAILED[/red]"
    console.print(f"Contract: [bold]{result.contract.name}[/bold] v{result.contract.version}")
    console.print(f"Status: {status}\n")

    if result.violations:
        # Violations table — capped at the first 20 entries.
        table = Table(title="Violations")
        table.add_column("Type", style="magenta")
        table.add_column("Field", style="cyan")
        table.add_column("Message")
        table.add_column("Severity")

        for v in result.violations[:20]:
            # Color-code the severity label; unknown severities render white.
            sev_style = {"error": "red", "warning": "yellow", "info": "dim"}.get(v.severity.value, "white")
            table.add_row(
                v.type.value,
                v.field or "-",
                v.message[:50],
                f"[{sev_style}]{v.severity.value}[/{sev_style}]",
            )

        console.print(table)
    else:
        console.print("[green]No violations found[/green]")
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
def _display_contract_diff(diff) -> None:
    """Render a contract comparison grouped by change severity."""

    def _section(changes, heading: str, bullet: str, trailing_blank: bool) -> None:
        # Print one severity group: heading, bulleted messages, optional gap.
        if not changes:
            return
        console.print(heading)
        for change in changes:
            console.print(f" {bullet} {change.message}")
        if trailing_blank:
            console.print()

    console.print(f"[bold]Comparing contracts[/bold]")
    console.print(f" Old: v{diff.old_contract.version}")
    console.print(f" New: v{diff.new_contract.version}\n")

    if not diff.has_changes:
        console.print("[green]No changes detected[/green]")
        return

    console.print(f"[bold]{len(diff.changes)} changes detected[/bold]\n")

    _section(diff.breaking_changes, "[red bold]Breaking Changes:[/red bold]", "❌", True)
    _section(diff.minor_changes, "[yellow bold]Minor Changes:[/yellow bold]", "⚠️", True)
    _section(diff.non_breaking_changes, "[dim]Non-breaking Changes:[/dim]", "•", False)

    console.print(f"\n[dim]Suggested version bump: {diff.suggest_version_bump()}[/dim]")
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
def _display_anomaly_report(report) -> None:
    """Render detected anomalies, or a success line when there are none."""
    if not report.has_anomalies:
        console.print("[green]✓ No anomalies detected[/green]")
        return

    console.print(f"[yellow bold]⚠️ {report.anomaly_count} anomalies detected[/yellow bold]\n")

    listing = Table(title="Anomalies")
    listing.add_column("Column", style="cyan")
    listing.add_column("Type", style="magenta")
    listing.add_column("Score", justify="right")
    listing.add_column("Message")

    for entry in report.get_anomalies():
        listing.add_row(
            entry.column or "-",
            entry.anomaly_type.value,
            f"{entry.score:.2f}",
            entry.message[:50],
        )

    console.print(listing)
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
def _save_results(output: str, dataset, results) -> None:
|
|
686
|
+
"""Save results to file."""
|
|
687
|
+
import json
|
|
688
|
+
|
|
689
|
+
data = {
|
|
690
|
+
"source": dataset.source,
|
|
691
|
+
"row_count": dataset.row_count,
|
|
692
|
+
"column_count": dataset.column_count,
|
|
693
|
+
"columns": dataset.columns,
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
if results:
|
|
697
|
+
data["checks"] = [
|
|
698
|
+
{"name": r[0], "passed": r[1], "details": r[2]}
|
|
699
|
+
for r in results
|
|
700
|
+
]
|
|
701
|
+
|
|
702
|
+
Path(output).write_text(json.dumps(data, indent=2))
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
# Allow running the CLI directly (e.g. `python -m duckguard.cli.main`)
# in addition to the installed `duckguard` entry point.
if __name__ == "__main__":
    app()
|