duckguard 3.0.1__py3-none-any.whl → 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING, Any
12
12
  from duckguard.reports.html_reporter import HTMLReporter, ReportConfig
13
13
 
14
14
  if TYPE_CHECKING:
15
- from duckguard.history.storage import StoredRun
15
+ from duckguard.history.storage import StoredRun, TrendDataPoint
16
16
  from duckguard.rules.executor import ExecutionResult
17
17
 
18
18
 
@@ -45,6 +45,9 @@ class PDFReporter(HTMLReporter):
45
45
  output_path: str | Path,
46
46
  *,
47
47
  history: list[StoredRun] | None = None,
48
+ trend_data: list[TrendDataPoint] | None = None,
49
+ row_count: int | None = None,
50
+ column_count: int | None = None,
48
51
  ) -> Path:
49
52
  """Generate a PDF report.
50
53
 
@@ -52,6 +55,9 @@ class PDFReporter(HTMLReporter):
52
55
  result: ExecutionResult to report on
53
56
  output_path: Path to write PDF file
54
57
  history: Optional historical results for trends
58
+ trend_data: Optional trend data points for chart rendering
59
+ row_count: Optional dataset row count for metadata display
60
+ column_count: Optional dataset column count for metadata display
55
61
 
56
62
  Returns:
57
63
  Path to generated PDF report
@@ -63,8 +69,7 @@ class PDFReporter(HTMLReporter):
63
69
  from weasyprint import HTML
64
70
  except ImportError:
65
71
  raise ImportError(
66
- "PDF reports require weasyprint. "
67
- "Install with: pip install duckguard[reports]"
72
+ "PDF reports require weasyprint. " "Install with: pip install duckguard[reports]"
68
73
  )
69
74
 
70
75
  output_path = Path(output_path)
@@ -80,7 +85,14 @@ class PDFReporter(HTMLReporter):
80
85
 
81
86
  try:
82
87
  # Generate HTML report
83
- super().generate(result, html_path, history=history)
88
+ super().generate(
89
+ result,
90
+ html_path,
91
+ history=history,
92
+ trend_data=trend_data,
93
+ row_count=row_count,
94
+ column_count=column_count,
95
+ )
84
96
 
85
97
  # Convert to PDF
86
98
  HTML(filename=str(html_path)).write_pdf(str(output_path))
@@ -97,6 +109,11 @@ class PDFReporter(HTMLReporter):
97
109
  def generate_pdf_report(
98
110
  result: ExecutionResult,
99
111
  output_path: str | Path,
112
+ *,
113
+ history: list[StoredRun] | None = None,
114
+ trend_data: list[TrendDataPoint] | None = None,
115
+ row_count: int | None = None,
116
+ column_count: int | None = None,
100
117
  **kwargs: Any,
101
118
  ) -> Path:
102
119
  """Convenience function to generate PDF report.
@@ -104,6 +121,10 @@ def generate_pdf_report(
104
121
  Args:
105
122
  result: ExecutionResult to report on
106
123
  output_path: Path to write PDF file
124
+ history: Optional historical results for trends
125
+ trend_data: Optional trend data points for chart rendering
126
+ row_count: Optional dataset row count
127
+ column_count: Optional dataset column count
107
128
  **kwargs: Additional ReportConfig options
108
129
 
109
130
  Returns:
@@ -111,4 +132,11 @@ def generate_pdf_report(
111
132
  """
112
133
  config = ReportConfig(**kwargs) if kwargs else None
113
134
  reporter = PDFReporter(config=config)
114
- return reporter.generate(result, output_path)
135
+ return reporter.generate(
136
+ result,
137
+ output_path,
138
+ history=history,
139
+ trend_data=trend_data,
140
+ row_count=row_count,
141
+ column_count=column_count,
142
+ )
@@ -178,7 +178,9 @@ NAME_PATTERNS: dict[SemanticType, list[str]] = {
178
178
  SemanticType.CURRENCY: [
179
179
  r"amount", r"price", r"cost", r"total", r"subtotal",
180
180
  r"revenue", r"salary", r"fee", r"charge", r"balance",
181
- r"payment", r".*_amt$", r".*_amount$"
181
+ r"payment", r"tax", r"shipping", r"discount", r"tip",
182
+ r".*_amt$", r".*_amount$", r".*_price$", r".*_cost$",
183
+ r"unit_price", r"list_price", r"net_.*", r"gross_.*"
182
184
  ],
183
185
  SemanticType.PERCENTAGE: [
184
186
  r"percent(age)?", r"rate", r"ratio", r"pct", r".*_pct$"
@@ -237,8 +239,8 @@ VALUE_PATTERNS: dict[SemanticType, str] = {
237
239
  SemanticType.TIME: r"^\d{2}:\d{2}(:\d{2})?$",
238
240
  SemanticType.COUNTRY_CODE: r"^[A-Z]{2,3}$",
239
241
  SemanticType.SLUG: r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
240
- SemanticType.LATITUDE: r"^-?([1-8]?\d(\.\d+)?|90(\.0+)?)$",
241
- SemanticType.LONGITUDE: r"^-?(1[0-7]\d(\.\d+)?|180(\.0+)?|\d{1,2}(\.\d+)?)$",
242
+ SemanticType.LATITUDE: r"^-?([1-8]?\d\.\d{4,}|90(\.0+)?)$",
243
+ SemanticType.LONGITUDE: r"^-?(1[0-7]\d\.\d{4,}|180(\.0+)?|\d{1,2}\.\d{4,})$",
242
244
  # Identifier pattern: PREFIX-NUMBER, ABC123, etc. (uppercase or mixed case with numbers)
243
245
  SemanticType.IDENTIFIER: r"^[A-Z][A-Z0-9]*[-_]?\d+$|^[A-Z]{2,}[-_][A-Z0-9]+$",
244
246
  }
@@ -386,12 +388,14 @@ class SemanticTypeDetector:
386
388
  reasons = []
387
389
  candidates: dict[SemanticType, float] = {}
388
390
 
389
- # 1. Check column name patterns
391
+ # 1. Check column name patterns (name is strongest signal)
390
392
  name_lower = column_name.lower().replace("-", "_")
393
+ name_matched_types: set[SemanticType] = set()
391
394
  for sem_type, patterns in self.name_patterns.items():
392
395
  for pattern in patterns:
393
396
  if re.match(pattern, name_lower, re.IGNORECASE):
394
- candidates[sem_type] = candidates.get(sem_type, 0) + 0.4
397
+ candidates[sem_type] = candidates.get(sem_type, 0) + 0.6
398
+ name_matched_types.add(sem_type)
395
399
  reasons.append(f"Column name matches '{sem_type.value}' pattern")
396
400
  break
397
401
 
@@ -408,13 +412,20 @@ class SemanticTypeDetector:
408
412
  )
409
413
  match_rate = match_count / min(len(string_values), 50)
410
414
 
415
+ # Reduce score for ambiguous value-pattern types (lat/lon, slug)
416
+ # when no name hint supports them
417
+ ambiguous_types = {SemanticType.LATITUDE, SemanticType.LONGITUDE, SemanticType.SLUG}
418
+ penalty = 0.0
419
+ if sem_type in ambiguous_types and sem_type not in name_matched_types:
420
+ penalty = 0.2
421
+
411
422
  if match_rate >= 0.8:
412
- candidates[sem_type] = candidates.get(sem_type, 0) + 0.5
423
+ candidates[sem_type] = candidates.get(sem_type, 0) + 0.5 - penalty
413
424
  reasons.append(
414
425
  f"{match_rate:.0%} of values match {sem_type.value} pattern"
415
426
  )
416
427
  elif match_rate >= 0.5:
417
- candidates[sem_type] = candidates.get(sem_type, 0) + 0.3
428
+ candidates[sem_type] = candidates.get(sem_type, 0) + 0.3 - penalty
418
429
  reasons.append(
419
430
  f"{match_rate:.0%} of values match {sem_type.value} pattern"
420
431
  )