duckguard 3.0.1__py3-none-any.whl → 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/ai/__init__.py +33 -0
- duckguard/ai/config.py +201 -0
- duckguard/ai/explainer.py +109 -0
- duckguard/ai/fixer.py +105 -0
- duckguard/ai/natural_language.py +119 -0
- duckguard/ai/rules_generator.py +121 -0
- duckguard/checks/conditional.py +4 -3
- duckguard/cli/main.py +480 -93
- duckguard/core/column.py +15 -5
- duckguard/core/result.py +35 -14
- duckguard/profiler/auto_profile.py +217 -64
- duckguard/py.typed +0 -0
- duckguard/reports/html_reporter.py +522 -37
- duckguard/reports/pdf_reporter.py +33 -5
- duckguard/semantic/detector.py +18 -7
- duckguard-3.2.0.dist-info/METADATA +1206 -0
- {duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/RECORD +22 -14
- duckguard-3.2.0.dist-info/licenses/LICENSE +190 -0
- duckguard-3.2.0.dist-info/licenses/NOTICE +7 -0
- duckguard-3.0.1.dist-info/METADATA +0 -1072
- duckguard-3.0.1.dist-info/licenses/LICENSE +0 -55
- {duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/WHEEL +0 -0
- {duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/entry_points.txt +0 -0
|
@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING, Any
|
|
|
12
12
|
from duckguard.reports.html_reporter import HTMLReporter, ReportConfig
|
|
13
13
|
|
|
14
14
|
if TYPE_CHECKING:
|
|
15
|
-
from duckguard.history.storage import StoredRun
|
|
15
|
+
from duckguard.history.storage import StoredRun, TrendDataPoint
|
|
16
16
|
from duckguard.rules.executor import ExecutionResult
|
|
17
17
|
|
|
18
18
|
|
|
@@ -45,6 +45,9 @@ class PDFReporter(HTMLReporter):
|
|
|
45
45
|
output_path: str | Path,
|
|
46
46
|
*,
|
|
47
47
|
history: list[StoredRun] | None = None,
|
|
48
|
+
trend_data: list[TrendDataPoint] | None = None,
|
|
49
|
+
row_count: int | None = None,
|
|
50
|
+
column_count: int | None = None,
|
|
48
51
|
) -> Path:
|
|
49
52
|
"""Generate a PDF report.
|
|
50
53
|
|
|
@@ -52,6 +55,9 @@ class PDFReporter(HTMLReporter):
|
|
|
52
55
|
result: ExecutionResult to report on
|
|
53
56
|
output_path: Path to write PDF file
|
|
54
57
|
history: Optional historical results for trends
|
|
58
|
+
trend_data: Optional trend data points for chart rendering
|
|
59
|
+
row_count: Optional dataset row count for metadata display
|
|
60
|
+
column_count: Optional dataset column count for metadata display
|
|
55
61
|
|
|
56
62
|
Returns:
|
|
57
63
|
Path to generated PDF report
|
|
@@ -63,8 +69,7 @@ class PDFReporter(HTMLReporter):
|
|
|
63
69
|
from weasyprint import HTML
|
|
64
70
|
except ImportError:
|
|
65
71
|
raise ImportError(
|
|
66
|
-
"PDF reports require weasyprint. "
|
|
67
|
-
"Install with: pip install duckguard[reports]"
|
|
72
|
+
"PDF reports require weasyprint. " "Install with: pip install duckguard[reports]"
|
|
68
73
|
)
|
|
69
74
|
|
|
70
75
|
output_path = Path(output_path)
|
|
@@ -80,7 +85,14 @@ class PDFReporter(HTMLReporter):
|
|
|
80
85
|
|
|
81
86
|
try:
|
|
82
87
|
# Generate HTML report
|
|
83
|
-
super().generate(
|
|
88
|
+
super().generate(
|
|
89
|
+
result,
|
|
90
|
+
html_path,
|
|
91
|
+
history=history,
|
|
92
|
+
trend_data=trend_data,
|
|
93
|
+
row_count=row_count,
|
|
94
|
+
column_count=column_count,
|
|
95
|
+
)
|
|
84
96
|
|
|
85
97
|
# Convert to PDF
|
|
86
98
|
HTML(filename=str(html_path)).write_pdf(str(output_path))
|
|
@@ -97,6 +109,11 @@ class PDFReporter(HTMLReporter):
|
|
|
97
109
|
def generate_pdf_report(
|
|
98
110
|
result: ExecutionResult,
|
|
99
111
|
output_path: str | Path,
|
|
112
|
+
*,
|
|
113
|
+
history: list[StoredRun] | None = None,
|
|
114
|
+
trend_data: list[TrendDataPoint] | None = None,
|
|
115
|
+
row_count: int | None = None,
|
|
116
|
+
column_count: int | None = None,
|
|
100
117
|
**kwargs: Any,
|
|
101
118
|
) -> Path:
|
|
102
119
|
"""Convenience function to generate PDF report.
|
|
@@ -104,6 +121,10 @@ def generate_pdf_report(
|
|
|
104
121
|
Args:
|
|
105
122
|
result: ExecutionResult to report on
|
|
106
123
|
output_path: Path to write PDF file
|
|
124
|
+
history: Optional historical results for trends
|
|
125
|
+
trend_data: Optional trend data points for chart rendering
|
|
126
|
+
row_count: Optional dataset row count
|
|
127
|
+
column_count: Optional dataset column count
|
|
107
128
|
**kwargs: Additional ReportConfig options
|
|
108
129
|
|
|
109
130
|
Returns:
|
|
@@ -111,4 +132,11 @@ def generate_pdf_report(
|
|
|
111
132
|
"""
|
|
112
133
|
config = ReportConfig(**kwargs) if kwargs else None
|
|
113
134
|
reporter = PDFReporter(config=config)
|
|
114
|
-
return reporter.generate(
|
|
135
|
+
return reporter.generate(
|
|
136
|
+
result,
|
|
137
|
+
output_path,
|
|
138
|
+
history=history,
|
|
139
|
+
trend_data=trend_data,
|
|
140
|
+
row_count=row_count,
|
|
141
|
+
column_count=column_count,
|
|
142
|
+
)
|
duckguard/semantic/detector.py
CHANGED
|
@@ -178,7 +178,9 @@ NAME_PATTERNS: dict[SemanticType, list[str]] = {
|
|
|
178
178
|
SemanticType.CURRENCY: [
|
|
179
179
|
r"amount", r"price", r"cost", r"total", r"subtotal",
|
|
180
180
|
r"revenue", r"salary", r"fee", r"charge", r"balance",
|
|
181
|
-
r"payment", r"
|
|
181
|
+
r"payment", r"tax", r"shipping", r"discount", r"tip",
|
|
182
|
+
r".*_amt$", r".*_amount$", r".*_price$", r".*_cost$",
|
|
183
|
+
r"unit_price", r"list_price", r"net_.*", r"gross_.*"
|
|
182
184
|
],
|
|
183
185
|
SemanticType.PERCENTAGE: [
|
|
184
186
|
r"percent(age)?", r"rate", r"ratio", r"pct", r".*_pct$"
|
|
@@ -237,8 +239,8 @@ VALUE_PATTERNS: dict[SemanticType, str] = {
|
|
|
237
239
|
SemanticType.TIME: r"^\d{2}:\d{2}(:\d{2})?$",
|
|
238
240
|
SemanticType.COUNTRY_CODE: r"^[A-Z]{2,3}$",
|
|
239
241
|
SemanticType.SLUG: r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
|
|
240
|
-
SemanticType.LATITUDE: r"^-?([1-8]?\d
|
|
241
|
-
SemanticType.LONGITUDE: r"^-?(1[0-7]\d
|
|
242
|
+
SemanticType.LATITUDE: r"^-?([1-8]?\d\.\d{4,}|90(\.0+)?)$",
|
|
243
|
+
SemanticType.LONGITUDE: r"^-?(1[0-7]\d\.\d{4,}|180(\.0+)?|\d{1,2}\.\d{4,})$",
|
|
242
244
|
# Identifier pattern: PREFIX-NUMBER, ABC123, etc. (uppercase or mixed case with numbers)
|
|
243
245
|
SemanticType.IDENTIFIER: r"^[A-Z][A-Z0-9]*[-_]?\d+$|^[A-Z]{2,}[-_][A-Z0-9]+$",
|
|
244
246
|
}
|
|
@@ -386,12 +388,14 @@ class SemanticTypeDetector:
|
|
|
386
388
|
reasons = []
|
|
387
389
|
candidates: dict[SemanticType, float] = {}
|
|
388
390
|
|
|
389
|
-
# 1. Check column name patterns
|
|
391
|
+
# 1. Check column name patterns (name is strongest signal)
|
|
390
392
|
name_lower = column_name.lower().replace("-", "_")
|
|
393
|
+
name_matched_types: set[SemanticType] = set()
|
|
391
394
|
for sem_type, patterns in self.name_patterns.items():
|
|
392
395
|
for pattern in patterns:
|
|
393
396
|
if re.match(pattern, name_lower, re.IGNORECASE):
|
|
394
|
-
candidates[sem_type] = candidates.get(sem_type, 0) + 0.
|
|
397
|
+
candidates[sem_type] = candidates.get(sem_type, 0) + 0.6
|
|
398
|
+
name_matched_types.add(sem_type)
|
|
395
399
|
reasons.append(f"Column name matches '{sem_type.value}' pattern")
|
|
396
400
|
break
|
|
397
401
|
|
|
@@ -408,13 +412,20 @@ class SemanticTypeDetector:
|
|
|
408
412
|
)
|
|
409
413
|
match_rate = match_count / min(len(string_values), 50)
|
|
410
414
|
|
|
415
|
+
# Reduce score for ambiguous value-pattern types (lat/lon/slug)
|
|
416
|
+
# when no name hint supports them
|
|
417
|
+
ambiguous_types = {SemanticType.LATITUDE, SemanticType.LONGITUDE, SemanticType.SLUG}
|
|
418
|
+
penalty = 0.0
|
|
419
|
+
if sem_type in ambiguous_types and sem_type not in name_matched_types:
|
|
420
|
+
penalty = 0.2
|
|
421
|
+
|
|
411
422
|
if match_rate >= 0.8:
|
|
412
|
-
candidates[sem_type] = candidates.get(sem_type, 0) + 0.5
|
|
423
|
+
candidates[sem_type] = candidates.get(sem_type, 0) + 0.5 - penalty
|
|
413
424
|
reasons.append(
|
|
414
425
|
f"{match_rate:.0%} of values match {sem_type.value} pattern"
|
|
415
426
|
)
|
|
416
427
|
elif match_rate >= 0.5:
|
|
417
|
-
candidates[sem_type] = candidates.get(sem_type, 0) + 0.3
|
|
428
|
+
candidates[sem_type] = candidates.get(sem_type, 0) + 0.3 - penalty
|
|
418
429
|
reasons.append(
|
|
419
430
|
f"{match_rate:.0%} of values match {sem_type.value} pattern"
|
|
420
431
|
)
|