duckguard 3.1.0__py3-none-any.whl → 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/ai/__init__.py +33 -0
- duckguard/ai/config.py +201 -0
- duckguard/ai/explainer.py +109 -0
- duckguard/ai/fixer.py +105 -0
- duckguard/ai/natural_language.py +119 -0
- duckguard/ai/rules_generator.py +121 -0
- duckguard/checks/conditional.py +4 -3
- duckguard/cli/main.py +156 -4
- duckguard/core/column.py +15 -5
- duckguard/py.typed +0 -0
- duckguard/reports/html_reporter.py +522 -37
- duckguard/reports/pdf_reporter.py +33 -5
- duckguard/semantic/detector.py +18 -7
- {duckguard-3.1.0.dist-info → duckguard-3.2.0.dist-info}/METADATA +98 -25
- {duckguard-3.1.0.dist-info → duckguard-3.2.0.dist-info}/RECORD +20 -12
- duckguard-3.2.0.dist-info/licenses/LICENSE +190 -0
- duckguard-3.2.0.dist-info/licenses/NOTICE +7 -0
- duckguard-3.1.0.dist-info/licenses/LICENSE +0 -55
- {duckguard-3.1.0.dist-info → duckguard-3.2.0.dist-info}/WHEEL +0 -0
- {duckguard-3.1.0.dist-info → duckguard-3.2.0.dist-info}/entry_points.txt +0 -0
duckguard/checks/conditional.py
CHANGED
|
@@ -609,9 +609,9 @@ class ConditionalCheckHandler:
|
|
|
609
609
|
# Normalize path for DuckDB (forward slashes work on all platforms)
|
|
610
610
|
source_path = dataset._source.replace('\\', '/')
|
|
611
611
|
|
|
612
|
-
# Format allowed values for SQL IN clause
|
|
612
|
+
# Format allowed values for SQL IN clause (with proper escaping)
|
|
613
613
|
if isinstance(allowed_values[0], str):
|
|
614
|
-
values_str = ", ".join(f"'{v}'" for v in allowed_values)
|
|
614
|
+
values_str = ", ".join(f"'{v.replace(chr(39), chr(39)+chr(39))}'" for v in allowed_values)
|
|
615
615
|
else:
|
|
616
616
|
values_str = ", ".join(str(v) for v in allowed_values)
|
|
617
617
|
|
|
@@ -701,11 +701,12 @@ class ConditionalCheckHandler:
|
|
|
701
701
|
# Normalize path for DuckDB (forward slashes work on all platforms)
|
|
702
702
|
source_path = dataset._source.replace('\\', '/')
|
|
703
703
|
|
|
704
|
+
safe_pattern = pattern.replace("'", "''")
|
|
704
705
|
sql = f"""
|
|
705
706
|
SELECT COUNT(*) as violations
|
|
706
707
|
FROM '{source_path}'
|
|
707
708
|
WHERE ({condition})
|
|
708
|
-
AND NOT regexp_matches({column}::VARCHAR, '{pattern}')
|
|
709
|
+
AND NOT regexp_matches({column}::VARCHAR, '{safe_pattern}')
|
|
709
710
|
"""
|
|
710
711
|
|
|
711
712
|
try:
|
duckguard/cli/main.py
CHANGED
|
@@ -1118,20 +1118,31 @@ def report(
|
|
|
1118
1118
|
True, "--include-passed/--no-passed", help="Include passed checks"
|
|
1119
1119
|
),
|
|
1120
1120
|
store: bool = typer.Option(False, "--store", "-s", help="Store results in history"),
|
|
1121
|
+
trends: bool = typer.Option(
|
|
1122
|
+
False, "--trends", help="Include quality trend charts from history"
|
|
1123
|
+
),
|
|
1124
|
+
trend_days: int = typer.Option(
|
|
1125
|
+
30, "--trend-days", help="Number of days of history for trend charts"
|
|
1126
|
+
),
|
|
1127
|
+
dark_mode: str = typer.Option("auto", "--dark-mode", help="Theme mode: auto, light, dark"),
|
|
1128
|
+
logo: str | None = typer.Option(None, "--logo", help="Logo URL or data URI for report header"),
|
|
1121
1129
|
) -> None:
|
|
1122
1130
|
"""
|
|
1123
1131
|
Generate a data quality report (HTML or PDF).
|
|
1124
1132
|
|
|
1125
|
-
Runs validation checks and generates a beautiful, shareable report
|
|
1133
|
+
Runs validation checks and generates a beautiful, shareable report
|
|
1134
|
+
with dark mode, interactive tables, and optional trend charts.
|
|
1126
1135
|
|
|
1127
1136
|
[bold]Examples:[/bold]
|
|
1128
1137
|
duckguard report data.csv
|
|
1129
1138
|
duckguard report data.csv --format pdf --output report.pdf
|
|
1130
1139
|
duckguard report data.csv --config rules.yaml --title "Orders Quality"
|
|
1131
1140
|
duckguard report data.csv --store # Also save to history
|
|
1141
|
+
duckguard report data.csv --trends # Include quality trend charts
|
|
1142
|
+
duckguard report data.csv --dark-mode dark # Force dark theme
|
|
1132
1143
|
"""
|
|
1133
1144
|
from duckguard.connectors import connect
|
|
1134
|
-
from duckguard.reports import
|
|
1145
|
+
from duckguard.reports import HTMLReporter, PDFReporter, ReportConfig
|
|
1135
1146
|
from duckguard.rules import execute_rules, generate_rules, load_rules
|
|
1136
1147
|
|
|
1137
1148
|
# Determine output path based on format
|
|
@@ -1182,6 +1193,31 @@ def report(
|
|
|
1182
1193
|
console.print(f"Quality Score: [cyan]{result.quality_score:.1f}%[/cyan]")
|
|
1183
1194
|
console.print(f"Checks: {result.passed_count}/{result.total_checks} passed\n")
|
|
1184
1195
|
|
|
1196
|
+
# Load trend data if requested
|
|
1197
|
+
trend_data = None
|
|
1198
|
+
history_runs = None
|
|
1199
|
+
if trends:
|
|
1200
|
+
from duckguard.history import HistoryStorage
|
|
1201
|
+
|
|
1202
|
+
try:
|
|
1203
|
+
storage_for_trends = HistoryStorage()
|
|
1204
|
+
trend_data = storage_for_trends.get_trend(source, days=trend_days)
|
|
1205
|
+
history_runs = storage_for_trends.get_runs(source, limit=20)
|
|
1206
|
+
if not trend_data:
|
|
1207
|
+
console.print("[dim]No historical data found for trend charts[/dim]")
|
|
1208
|
+
except Exception:
|
|
1209
|
+
console.print("[dim]No historical data found for trend charts[/dim]")
|
|
1210
|
+
|
|
1211
|
+
# Build report config
|
|
1212
|
+
report_config = ReportConfig(
|
|
1213
|
+
title=title,
|
|
1214
|
+
include_passed=include_passed,
|
|
1215
|
+
include_trends=trends,
|
|
1216
|
+
trend_days=trend_days,
|
|
1217
|
+
dark_mode=dark_mode,
|
|
1218
|
+
logo_url=logo,
|
|
1219
|
+
)
|
|
1220
|
+
|
|
1185
1221
|
# Generate report
|
|
1186
1222
|
with Progress(
|
|
1187
1223
|
SpinnerColumn(),
|
|
@@ -1192,9 +1228,18 @@ def report(
|
|
|
1192
1228
|
progress.add_task(f"Generating {output_format.upper()} report...", total=None)
|
|
1193
1229
|
|
|
1194
1230
|
if output_format.lower() == "pdf":
|
|
1195
|
-
|
|
1231
|
+
reporter = PDFReporter(config=report_config)
|
|
1196
1232
|
else:
|
|
1197
|
-
|
|
1233
|
+
reporter = HTMLReporter(config=report_config)
|
|
1234
|
+
|
|
1235
|
+
reporter.generate(
|
|
1236
|
+
result,
|
|
1237
|
+
output,
|
|
1238
|
+
history=history_runs,
|
|
1239
|
+
trend_data=trend_data,
|
|
1240
|
+
row_count=dataset.row_count,
|
|
1241
|
+
column_count=dataset.column_count,
|
|
1242
|
+
)
|
|
1198
1243
|
|
|
1199
1244
|
console.print(f"[green]SAVED[/green] Report saved to [cyan]{output}[/cyan]")
|
|
1200
1245
|
console.print("[dim]Open in browser to view the report[/dim]")
|
|
@@ -1442,5 +1487,112 @@ def schema(
|
|
|
1442
1487
|
raise typer.Exit(1)
|
|
1443
1488
|
|
|
1444
1489
|
|
|
1490
|
+
# =========================================================================
|
|
1491
|
+
# AI-Powered Commands
|
|
1492
|
+
# =========================================================================
|
|
1493
|
+
|
|
1494
|
+
|
|
1495
|
+
@app.command()
|
|
1496
|
+
def explain(
|
|
1497
|
+
source: str = typer.Argument(..., help="Path to file or connection string"),
|
|
1498
|
+
table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
|
|
1499
|
+
focus: str | None = typer.Option(None, "--focus", "-f", help="Column or aspect to focus on"),
|
|
1500
|
+
detail: str = typer.Option("medium", "--detail", "-d", help="Detail level: brief, medium, detailed"),
|
|
1501
|
+
) -> None:
|
|
1502
|
+
"""Explain data quality issues in plain English (AI-powered).
|
|
1503
|
+
|
|
1504
|
+
Requires: pip install duckguard[llm]
|
|
1505
|
+
"""
|
|
1506
|
+
try:
|
|
1507
|
+
from duckguard.ai import explain as ai_explain
|
|
1508
|
+
from duckguard.connectors import connect as dg_connect
|
|
1509
|
+
|
|
1510
|
+
with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console):
|
|
1511
|
+
dataset = dg_connect(source, table=table)
|
|
1512
|
+
|
|
1513
|
+
with console.status("[bold green]Analyzing with AI..."):
|
|
1514
|
+
result = ai_explain(dataset, focus=focus, detail=detail)
|
|
1515
|
+
|
|
1516
|
+
console.print()
|
|
1517
|
+
console.print(Panel(result, title="[bold]Data Quality Explanation[/bold]", border_style="green"))
|
|
1518
|
+
|
|
1519
|
+
except ImportError:
|
|
1520
|
+
console.print("[red]Error:[/red] AI features require LLM packages.")
|
|
1521
|
+
console.print("Install with: [bold]pip install duckguard[llm][/bold]")
|
|
1522
|
+
raise typer.Exit(1)
|
|
1523
|
+
except Exception as e:
|
|
1524
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1525
|
+
raise typer.Exit(1)
|
|
1526
|
+
|
|
1527
|
+
|
|
1528
|
+
@app.command()
|
|
1529
|
+
def suggest(
|
|
1530
|
+
source: str = typer.Argument(..., help="Path to file or connection string"),
|
|
1531
|
+
table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
|
|
1532
|
+
output: str | None = typer.Option(None, "--output", "-o", help="Output file (default: stdout)"),
|
|
1533
|
+
strict: bool = typer.Option(False, "--strict", help="Generate stricter rules"),
|
|
1534
|
+
) -> None:
|
|
1535
|
+
"""Generate validation rules using AI (AI-powered).
|
|
1536
|
+
|
|
1537
|
+
Requires: pip install duckguard[llm]
|
|
1538
|
+
"""
|
|
1539
|
+
try:
|
|
1540
|
+
from duckguard.ai import suggest_rules
|
|
1541
|
+
from duckguard.connectors import connect as dg_connect
|
|
1542
|
+
|
|
1543
|
+
with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console):
|
|
1544
|
+
dataset = dg_connect(source, table=table)
|
|
1545
|
+
|
|
1546
|
+
with console.status("[bold green]Generating rules with AI..."):
|
|
1547
|
+
rules_yaml = suggest_rules(dataset, strict=strict)
|
|
1548
|
+
|
|
1549
|
+
if output:
|
|
1550
|
+
with open(output, "w") as f:
|
|
1551
|
+
f.write(rules_yaml)
|
|
1552
|
+
console.print(f"[green]Rules written to {output}[/green]")
|
|
1553
|
+
else:
|
|
1554
|
+
console.print()
|
|
1555
|
+
console.print(Syntax(rules_yaml, "yaml", theme="monokai"))
|
|
1556
|
+
|
|
1557
|
+
except ImportError:
|
|
1558
|
+
console.print("[red]Error:[/red] AI features require LLM packages.")
|
|
1559
|
+
console.print("Install with: [bold]pip install duckguard[llm][/bold]")
|
|
1560
|
+
raise typer.Exit(1)
|
|
1561
|
+
except Exception as e:
|
|
1562
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1563
|
+
raise typer.Exit(1)
|
|
1564
|
+
|
|
1565
|
+
|
|
1566
|
+
@app.command()
|
|
1567
|
+
def fix(
|
|
1568
|
+
source: str = typer.Argument(..., help="Path to file or connection string"),
|
|
1569
|
+
table: str | None = typer.Option(None, "--table", "-t", help="Table name"),
|
|
1570
|
+
) -> None:
|
|
1571
|
+
"""Suggest data quality fixes using AI (AI-powered).
|
|
1572
|
+
|
|
1573
|
+
Requires: pip install duckguard[llm]
|
|
1574
|
+
"""
|
|
1575
|
+
try:
|
|
1576
|
+
from duckguard.ai import suggest_fixes
|
|
1577
|
+
from duckguard.connectors import connect as dg_connect
|
|
1578
|
+
|
|
1579
|
+
with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console):
|
|
1580
|
+
dataset = dg_connect(source, table=table)
|
|
1581
|
+
|
|
1582
|
+
with console.status("[bold green]Analyzing fixes with AI..."):
|
|
1583
|
+
result = suggest_fixes(dataset)
|
|
1584
|
+
|
|
1585
|
+
console.print()
|
|
1586
|
+
console.print(Panel(result, title="[bold]Suggested Fixes[/bold]", border_style="yellow"))
|
|
1587
|
+
|
|
1588
|
+
except ImportError:
|
|
1589
|
+
console.print("[red]Error:[/red] AI features require LLM packages.")
|
|
1590
|
+
console.print("Install with: [bold]pip install duckguard[llm][/bold]")
|
|
1591
|
+
raise typer.Exit(1)
|
|
1592
|
+
except Exception as e:
|
|
1593
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1594
|
+
raise typer.Exit(1)
|
|
1595
|
+
|
|
1596
|
+
|
|
1445
1597
|
if __name__ == "__main__":
|
|
1446
1598
|
app()
|
duckguard/core/column.py
CHANGED
|
@@ -13,6 +13,14 @@ if TYPE_CHECKING:
|
|
|
13
13
|
DEFAULT_SAMPLE_SIZE = 10
|
|
14
14
|
|
|
15
15
|
|
|
16
|
+
def _escape_sql_string(value: str) -> str:
|
|
17
|
+
"""Escape a string value for safe use in SQL queries.
|
|
18
|
+
|
|
19
|
+
Replaces single quotes with doubled single quotes (SQL standard escaping).
|
|
20
|
+
"""
|
|
21
|
+
return value.replace("'", "''")
|
|
22
|
+
|
|
23
|
+
|
|
16
24
|
class Column:
|
|
17
25
|
"""
|
|
18
26
|
Represents a column in a dataset with validation capabilities.
|
|
@@ -246,11 +254,12 @@ class Column:
|
|
|
246
254
|
col = f'"{self._name}"'
|
|
247
255
|
|
|
248
256
|
# DuckDB uses regexp_matches for regex
|
|
257
|
+
safe_pattern = _escape_sql_string(pattern)
|
|
249
258
|
sql = f"""
|
|
250
259
|
SELECT COUNT(*) as non_matching
|
|
251
260
|
FROM {ref}
|
|
252
261
|
WHERE {col} IS NOT NULL
|
|
253
|
-
AND NOT regexp_matches({col}::VARCHAR, '{pattern}')
|
|
262
|
+
AND NOT regexp_matches({col}::VARCHAR, '{safe_pattern}')
|
|
254
263
|
"""
|
|
255
264
|
|
|
256
265
|
non_matching = self._dataset.engine.fetch_value(sql) or 0
|
|
@@ -275,12 +284,13 @@ class Column:
|
|
|
275
284
|
"""Get sample of rows that failed pattern match."""
|
|
276
285
|
ref = self._dataset.engine.get_source_reference(self._dataset.source)
|
|
277
286
|
col = f'"{self._name}"'
|
|
287
|
+
safe_pattern = _escape_sql_string(pattern)
|
|
278
288
|
|
|
279
289
|
sql = f"""
|
|
280
290
|
SELECT row_number() OVER () as row_idx, {col} as val
|
|
281
291
|
FROM {ref}
|
|
282
292
|
WHERE {col} IS NOT NULL
|
|
283
|
-
AND NOT regexp_matches({col}::VARCHAR, '{pattern}')
|
|
293
|
+
AND NOT regexp_matches({col}::VARCHAR, '{safe_pattern}')
|
|
284
294
|
LIMIT {limit}
|
|
285
295
|
"""
|
|
286
296
|
|
|
@@ -310,9 +320,9 @@ class Column:
|
|
|
310
320
|
ref = self._dataset.engine.get_source_reference(self._dataset.source)
|
|
311
321
|
col = f'"{self._name}"'
|
|
312
322
|
|
|
313
|
-
# Build value list for SQL
|
|
323
|
+
# Build value list for SQL (with proper escaping)
|
|
314
324
|
formatted_values = ", ".join(
|
|
315
|
-
f"'{v}'" if isinstance(v, str) else str(v) for v in values
|
|
325
|
+
f"'{_escape_sql_string(str(v))}'" if isinstance(v, str) else str(v) for v in values
|
|
316
326
|
)
|
|
317
327
|
|
|
318
328
|
sql = f"""
|
|
@@ -346,7 +356,7 @@ class Column:
|
|
|
346
356
|
col = f'"{self._name}"'
|
|
347
357
|
|
|
348
358
|
formatted_values = ", ".join(
|
|
349
|
-
f"'{v}'" if isinstance(v, str) else str(v) for v in values
|
|
359
|
+
f"'{_escape_sql_string(str(v))}'" if isinstance(v, str) else str(v) for v in values
|
|
350
360
|
)
|
|
351
361
|
|
|
352
362
|
sql = f"""
|
duckguard/py.typed
ADDED
|
File without changes
|