PyPI - duckguard - Versions diffs - 3.0.1__py3-none-any.whl → 3.2.0__py3-none-any.whl - Mend

duckguard 3.0.1__py3-none-any.whl → 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

duckguard/__init__.py +1 -1
duckguard/ai/__init__.py +33 -0
duckguard/ai/config.py +201 -0
duckguard/ai/explainer.py +109 -0
duckguard/ai/fixer.py +105 -0
duckguard/ai/natural_language.py +119 -0
duckguard/ai/rules_generator.py +121 -0
duckguard/checks/conditional.py +4 -3
duckguard/cli/main.py +480 -93
duckguard/core/column.py +15 -5
duckguard/core/result.py +35 -14
duckguard/profiler/auto_profile.py +217 -64
duckguard/py.typed +0 -0
duckguard/reports/html_reporter.py +522 -37
duckguard/reports/pdf_reporter.py +33 -5
duckguard/semantic/detector.py +18 -7
duckguard-3.2.0.dist-info/METADATA +1206 -0
{duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/RECORD +22 -14
duckguard-3.2.0.dist-info/licenses/LICENSE +190 -0
duckguard-3.2.0.dist-info/licenses/NOTICE +7 -0
duckguard-3.0.1.dist-info/METADATA +0 -1072
duckguard-3.0.1.dist-info/licenses/LICENSE +0 -55
{duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/WHEEL +0 -0
{duckguard-3.0.1.dist-info → duckguard-3.2.0.dist-info}/entry_points.txt +0 -0

duckguard/reports/pdf_reporter.py CHANGED Viewed

@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING, Any
 from duckguard.reports.html_reporter import HTMLReporter, ReportConfig
 if TYPE_CHECKING:
-    from duckguard.history.storage import StoredRun
+    from duckguard.history.storage import StoredRun, TrendDataPoint
     from duckguard.rules.executor import ExecutionResult
@@ -45,6 +45,9 @@ class PDFReporter(HTMLReporter):
         output_path: str | Path,
         *,
         history: list[StoredRun] | None = None,
+        trend_data: list[TrendDataPoint] | None = None,
+        row_count: int | None = None,
+        column_count: int | None = None,
     ) -> Path:
         """Generate a PDF report.
@@ -52,6 +55,9 @@ class PDFReporter(HTMLReporter):
             result: ExecutionResult to report on
             output_path: Path to write PDF file
             history: Optional historical results for trends
+            trend_data: Optional trend data points for chart rendering
+            row_count: Optional dataset row count for metadata display
+            column_count: Optional dataset column count for metadata display
         Returns:
             Path to generated PDF report
@@ -63,8 +69,7 @@ class PDFReporter(HTMLReporter):
             from weasyprint import HTML
         except ImportError:
             raise ImportError(
-                "PDF reports require weasyprint. "
-                "Install with: pip install duckguard[reports]"
+                "PDF reports require weasyprint. " "Install with: pip install duckguard[reports]"
             )
         output_path = Path(output_path)
@@ -80,7 +85,14 @@ class PDFReporter(HTMLReporter):
         try:
             # Generate HTML report
-            super().generate(result, html_path, history=history)
+            super().generate(
+                result,
+                html_path,
+                history=history,
+                trend_data=trend_data,
+                row_count=row_count,
+                column_count=column_count,
+            )
             # Convert to PDF
             HTML(filename=str(html_path)).write_pdf(str(output_path))
@@ -97,6 +109,11 @@ class PDFReporter(HTMLReporter):
 def generate_pdf_report(
     result: ExecutionResult,
     output_path: str | Path,
+    *,
+    history: list[StoredRun] | None = None,
+    trend_data: list[TrendDataPoint] | None = None,
+    row_count: int | None = None,
+    column_count: int | None = None,
     **kwargs: Any,
 ) -> Path:
     """Convenience function to generate PDF report.
@@ -104,6 +121,10 @@ def generate_pdf_report(
     Args:
         result: ExecutionResult to report on
         output_path: Path to write PDF file
+        history: Optional historical results for trends
+        trend_data: Optional trend data points for chart rendering
+        row_count: Optional dataset row count
+        column_count: Optional dataset column count
         **kwargs: Additional ReportConfig options
     Returns:
@@ -111,4 +132,11 @@ def generate_pdf_report(
     """
     config = ReportConfig(**kwargs) if kwargs else None
     reporter = PDFReporter(config=config)
-    return reporter.generate(result, output_path)
+    return reporter.generate(
+        result,
+        output_path,
+        history=history,
+        trend_data=trend_data,
+        row_count=row_count,
+        column_count=column_count,
+    )

duckguard/semantic/detector.py CHANGED Viewed

@@ -178,7 +178,9 @@ NAME_PATTERNS: dict[SemanticType, list[str]] = {
     SemanticType.CURRENCY: [
         r"amount", r"price", r"cost", r"total", r"subtotal",
         r"revenue", r"salary", r"fee", r"charge", r"balance",
-        r"payment", r".*_amt$", r".*_amount$"
+        r"payment", r"tax", r"shipping", r"discount", r"tip",
+        r".*_amt$", r".*_amount$", r".*_price$", r".*_cost$",
+        r"unit_price", r"list_price", r"net_.*", r"gross_.*"
     ],
     SemanticType.PERCENTAGE: [
         r"percent(age)?", r"rate", r"ratio", r"pct", r".*_pct$"
@@ -237,8 +239,8 @@ VALUE_PATTERNS: dict[SemanticType, str] = {
     SemanticType.TIME: r"^\d{2}:\d{2}(:\d{2})?$",
     SemanticType.COUNTRY_CODE: r"^[A-Z]{2,3}$",
     SemanticType.SLUG: r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
-    SemanticType.LATITUDE: r"^-?([1-8]?\d(\.\d+)?|90(\.0+)?)$",
-    SemanticType.LONGITUDE: r"^-?(1[0-7]\d(\.\d+)?|180(\.0+)?|\d{1,2}(\.\d+)?)$",
+    SemanticType.LATITUDE: r"^-?([1-8]?\d\.\d{4,}|90(\.0+)?)$",
+    SemanticType.LONGITUDE: r"^-?(1[0-7]\d\.\d{4,}|180(\.0+)?|\d{1,2}\.\d{4,})$",
     # Identifier pattern: PREFIX-NUMBER, ABC123, etc. (uppercase or mixed case with numbers)
     SemanticType.IDENTIFIER: r"^[A-Z][A-Z0-9]*[-_]?\d+$|^[A-Z]{2,}[-_][A-Z0-9]+$",
 }
@@ -386,12 +388,14 @@ class SemanticTypeDetector:
         reasons = []
         candidates: dict[SemanticType, float] = {}
-        # 1. Check column name patterns
+        # 1. Check column name patterns (name is strongest signal)
         name_lower = column_name.lower().replace("-", "_")
+        name_matched_types: set[SemanticType] = set()
         for sem_type, patterns in self.name_patterns.items():
             for pattern in patterns:
                 if re.match(pattern, name_lower, re.IGNORECASE):
-                    candidates[sem_type] = candidates.get(sem_type, 0) + 0.4
+                    candidates[sem_type] = candidates.get(sem_type, 0) + 0.6
+                    name_matched_types.add(sem_type)
                     reasons.append(f"Column name matches '{sem_type.value}' pattern")
                     break
@@ -408,13 +412,20 @@ class SemanticTypeDetector:
                     )
                     match_rate = match_count / min(len(string_values), 50)
+                    # Reduce score for ambiguous numeric types (lat/lon)
+                    # when no name hint supports them
+                    ambiguous_types = {SemanticType.LATITUDE, SemanticType.LONGITUDE, SemanticType.SLUG}
+                    penalty = 0.0
+                    if sem_type in ambiguous_types and sem_type not in name_matched_types:
+                        penalty = 0.2
                     if match_rate >= 0.8:
-                        candidates[sem_type] = candidates.get(sem_type, 0) + 0.5
+                        candidates[sem_type] = candidates.get(sem_type, 0) + 0.5 - penalty
                         reasons.append(
                             f"{match_rate:.0%} of values match {sem_type.value} pattern"
                         )
                     elif match_rate >= 0.5:
-                        candidates[sem_type] = candidates.get(sem_type, 0) + 0.3
+                        candidates[sem_type] = candidates.get(sem_type, 0) + 0.3 - penalty
                         reasons.append(
                             f"{match_rate:.0%} of values match {sem_type.value} pattern"
                         )

duckguard 3.0.1__py3-none-any.whl → 3.2.0__py3-none-any.whl

duckguard 3.0.1__py3-none-any.whl → 3.2.0__py3-none-any.whl