duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,288 @@
1
+ """Schema tracking implementation.
2
+
3
+ Provides functionality to capture and store schema snapshots over time.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import uuid
10
+ from dataclasses import dataclass
11
+ from datetime import datetime
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ from duckguard.history.schema import QUERIES
15
+ from duckguard.history.storage import HistoryStorage
16
+
17
+ if TYPE_CHECKING:
18
+ from duckguard.core.dataset import Dataset
19
+
20
+
21
@dataclass
class ColumnSchema:
    """Schema description of one column in a data source.

    Attributes:
        name: Column name
        dtype: Data type as string
        nullable: Whether the column allows nulls
        position: Position in the table (0-indexed)
    """

    name: str
    dtype: str
    nullable: bool
    position: int

    def to_dict(self) -> dict[str, Any]:
        """Return the column as a plain dictionary keyed by field name."""
        field_names = ("name", "dtype", "nullable", "position")
        return {field_name: getattr(self, field_name) for field_name in field_names}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ColumnSchema:
        """Build a column from a dictionary.

        ``name`` and ``dtype`` are required keys. ``nullable`` falls back to
        True and ``position`` to 0 when they are absent.
        """
        nullable = data.get("nullable", True)
        position = data.get("position", 0)
        return cls(data["name"], data["dtype"], nullable, position)
55
+
56
+
57
@dataclass
class SchemaSnapshot:
    """Represents a captured schema at a point in time.

    Attributes:
        source: Data source path
        snapshot_id: Unique identifier for this snapshot
        captured_at: When the snapshot was captured
        columns: List of column schemas
        row_count: Optional row count at capture time
    """

    source: str
    snapshot_id: str
    captured_at: datetime
    columns: list[ColumnSchema]
    row_count: int | None = None

    @property
    def column_count(self) -> int:
        """Get the number of columns."""
        return len(self.columns)

    @property
    def column_names(self) -> list[str]:
        """Get list of column names, in column order."""
        return [c.name for c in self.columns]

    def get_column(self, name: str) -> ColumnSchema | None:
        """Get the first column whose name equals ``name``, or None."""
        return next((col for col in self.columns if col.name == name), None)

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization.

        ``captured_at`` is emitted as an ISO-8601 string.
        """
        return {
            "source": self.source,
            "snapshot_id": self.snapshot_id,
            "captured_at": self.captured_at.isoformat(),
            "columns": [c.to_dict() for c in self.columns],
            "row_count": self.row_count,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SchemaSnapshot:
        """Create from a dictionary shaped like the output of ``to_dict``.

        ``row_count`` is optional and defaults to None.
        """
        return cls(
            source=data["source"],
            snapshot_id=data["snapshot_id"],
            captured_at=datetime.fromisoformat(data["captured_at"]),
            columns=[ColumnSchema.from_dict(c) for c in data["columns"]],
            row_count=data.get("row_count"),
        )

    def __eq__(self, other: object) -> bool:
        """Check schema equality.

        Two snapshots are equal when they have the same source and the same
        columns (name, dtype, nullable) in the same order. ``snapshot_id``,
        ``captured_at`` and ``row_count`` are ignored.
        """
        if not isinstance(other, SchemaSnapshot):
            # Let Python try the reflected comparison; ``==`` against an
            # unrelated type still evaluates to False.
            return NotImplemented
        return (
            self.source == other.source
            and len(self.columns) == len(other.columns)
            and all(
                c1.name == c2.name and c1.dtype == c2.dtype and c1.nullable == c2.nullable
                for c1, c2 in zip(self.columns, other.columns)
            )
        )
125
+
126
+
127
class SchemaTracker:
    """Track schema changes over time.

    Usage:
        from duckguard import connect
        from duckguard.schema_history import SchemaTracker

        tracker = SchemaTracker()
        data = connect("data.csv")

        # Capture current schema
        snapshot = tracker.capture(data)

        # Get history
        history = tracker.get_history(data.source)

        # Get latest snapshot
        latest = tracker.get_latest(data.source)
    """

    def __init__(self, storage: HistoryStorage | None = None):
        """Initialize schema tracker.

        Args:
            storage: Optional HistoryStorage instance. Uses default if not provided.
        """
        # Compare against None explicitly so that a caller-supplied storage
        # object is never replaced just because it evaluates as falsy.
        self._storage = storage if storage is not None else HistoryStorage()

    @property
    def storage(self) -> HistoryStorage:
        """Get the underlying storage."""
        return self._storage

    def capture(self, dataset: Dataset) -> SchemaSnapshot:
        """Capture current schema as a snapshot and persist it.

        Args:
            dataset: Dataset to capture schema from

        Returns:
            SchemaSnapshot representing current state
        """
        # Get schema information from the engine
        columns = self._get_column_schemas(dataset)

        snapshot = SchemaSnapshot(
            source=dataset.source,
            snapshot_id=str(uuid.uuid4()),
            # NOTE(review): naive local timestamp; stored via isoformat() and
            # read back with fromisoformat(), so the round trip is consistent.
            captured_at=datetime.now(),
            columns=columns,
            row_count=dataset.row_count,
        )

        # Store in database
        self._store_snapshot(snapshot)

        return snapshot

    def get_history(
        self,
        source: str,
        limit: int = 50,
    ) -> list[SchemaSnapshot]:
        """Get schema snapshot history for a source.

        Args:
            source: Data source path
            limit: Maximum snapshots to return

        Returns:
            List of SchemaSnapshot objects, most recent first (ordering is
            determined by the ``get_schema_snapshots`` query)
        """
        conn = self._storage._get_connection()
        cursor = conn.execute(QUERIES["get_schema_snapshots"], (source, limit))

        return [self._row_to_snapshot(row) for row in cursor.fetchall()]

    def get_latest(self, source: str) -> SchemaSnapshot | None:
        """Get the most recent schema snapshot for a source.

        Args:
            source: Data source path

        Returns:
            SchemaSnapshot or None if no snapshots exist
        """
        conn = self._storage._get_connection()
        cursor = conn.execute(QUERIES["get_latest_schema_snapshot"], (source,))
        row = cursor.fetchone()

        return self._row_to_snapshot(row) if row else None

    def get_snapshot(self, snapshot_id: str) -> SchemaSnapshot | None:
        """Get a specific snapshot by ID.

        Args:
            snapshot_id: Snapshot ID

        Returns:
            SchemaSnapshot or None if not found
        """
        conn = self._storage._get_connection()
        cursor = conn.execute(QUERIES["get_schema_snapshot_by_id"], (snapshot_id,))
        row = cursor.fetchone()

        return self._row_to_snapshot(row) if row else None

    def _get_column_schemas(self, dataset: Dataset) -> list[ColumnSchema]:
        """Get column schemas from dataset by running DESCRIBE on its source.

        Each result row is read positionally: field 0 is the column name,
        field 1 the type, and field 2 (when present) is compared against
        "YES" to decide nullability. Rows with fewer than three fields are
        treated as nullable.
        """
        # Get column info from DuckDB
        ref = dataset.engine.get_source_reference(dataset.source)
        result = dataset.engine.execute(f"DESCRIBE {ref}")

        return [
            ColumnSchema(
                name=row[0],
                dtype=row[1],
                nullable=row[2] == "YES" if len(row) > 2 else True,
                position=position,
            )
            for position, row in enumerate(result.fetchall())
        ]

    def _store_snapshot(self, snapshot: SchemaSnapshot) -> None:
        """Store a snapshot in the database and commit."""
        conn = self._storage._get_connection()

        # Only the column list is serialized to JSON; the remaining snapshot
        # fields are passed as separate query parameters.
        schema_json = json.dumps({
            "columns": [c.to_dict() for c in snapshot.columns]
        })

        conn.execute(
            QUERIES["insert_schema_snapshot"],
            (
                snapshot.source,
                snapshot.snapshot_id,
                snapshot.captured_at.isoformat(),
                schema_json,
                snapshot.column_count,
                snapshot.row_count,
            ),
        )
        conn.commit()

    def _row_to_snapshot(self, row: Any) -> SchemaSnapshot:
        """Convert database row to SchemaSnapshot.

        The row is indexed by column name (``schema_json``, ``source``,
        ``snapshot_id``, ``captured_at``, ``row_count``).
        """
        schema_data = json.loads(row["schema_json"])
        columns = [ColumnSchema.from_dict(c) for c in schema_data["columns"]]

        return SchemaSnapshot(
            source=row["source"],
            snapshot_id=row["snapshot_id"],
            captured_at=datetime.fromisoformat(row["captured_at"]),
            columns=columns,
            row_count=row["row_count"],
        )
@@ -12,13 +12,13 @@ Example:
12
12
  print(result.confidence) # 0.95
13
13
  """
14
14
 
15
+ from duckguard.semantic.analyzer import SemanticAnalyzer
15
16
  from duckguard.semantic.detector import (
16
17
  SemanticType,
17
18
  SemanticTypeResult,
18
19
  detect_type,
19
20
  detect_types_for_dataset,
20
21
  )
21
- from duckguard.semantic.analyzer import SemanticAnalyzer
22
22
  from duckguard.semantic.validators import get_validator_for_type
23
23
 
24
24
  __all__ = [
@@ -12,9 +12,7 @@ from typing import Any
12
12
  from duckguard.core.dataset import Dataset
13
13
  from duckguard.semantic.detector import (
14
14
  SemanticType,
15
- SemanticTypeResult,
16
15
  SemanticTypeDetector,
17
- PII_TYPES,
18
16
  )
19
17
 
20
18
 
@@ -73,6 +73,7 @@ class SemanticType(Enum):
73
73
  TITLE = "title"
74
74
  SLUG = "slug"
75
75
  CODE = "code"
76
+ IDENTIFIER = "identifier"
76
77
 
77
78
  # Unknown
78
79
  UNKNOWN = "unknown"
@@ -216,6 +217,9 @@ NAME_PATTERNS: dict[SemanticType, list[str]] = {
216
217
  SemanticType.CODE: [
217
218
  r"code", r".*_code$"
218
219
  ],
220
+ SemanticType.IDENTIFIER: [
221
+ r".*_id$", r".*_key$", r".*_code$", r".*_num(ber)?$", r".*_no$"
222
+ ],
219
223
  }
220
224
 
221
225
  # Value patterns for detection
@@ -235,6 +239,15 @@ VALUE_PATTERNS: dict[SemanticType, str] = {
235
239
  SemanticType.SLUG: r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
236
240
  SemanticType.LATITUDE: r"^-?([1-8]?\d(\.\d+)?|90(\.0+)?)$",
237
241
  SemanticType.LONGITUDE: r"^-?(1[0-7]\d(\.\d+)?|180(\.0+)?|\d{1,2}(\.\d+)?)$",
242
+ # Identifier pattern: PREFIX-NUMBER, ABC123, etc. (uppercase or mixed case with numbers)
243
+ SemanticType.IDENTIFIER: r"^[A-Z][A-Z0-9]*[-_]?\d+$|^[A-Z]{2,}[-_][A-Z0-9]+$",
244
+ }
245
+
246
+ # Patterns that must be matched case-sensitively (not using IGNORECASE)
247
+ CASE_SENSITIVE_PATTERNS = {
248
+ SemanticType.SLUG, # Slugs must be lowercase
249
+ SemanticType.IDENTIFIER, # Identifiers are typically uppercase
250
+ SemanticType.COUNTRY_CODE, # Country codes are uppercase
238
251
  }
239
252
 
240
253
  # PII types that should be flagged
@@ -269,6 +282,7 @@ TYPE_VALIDATIONS: dict[SemanticType, list[str]] = {
269
282
  SemanticType.LONGITUDE: ["range: [-180, 180]"],
270
283
  SemanticType.BOOLEAN: ["allowed_values: [true, false]"],
271
284
  SemanticType.COUNTRY_CODE: ["pattern: country_code"],
285
+ SemanticType.IDENTIFIER: ["not_null"],
272
286
  }
273
287
 
274
288
 
@@ -386,9 +400,11 @@ class SemanticTypeDetector:
386
400
  string_values = [str(v) for v in sample_values if v is not None]
387
401
  if string_values:
388
402
  for sem_type, pattern in self.value_patterns.items():
403
+ # Use case-sensitive matching for certain patterns
404
+ flags = 0 if sem_type in CASE_SENSITIVE_PATTERNS else re.IGNORECASE
389
405
  match_count = sum(
390
406
  1 for v in string_values[:50]
391
- if re.match(pattern, v, re.IGNORECASE)
407
+ if re.match(pattern, v, flags)
392
408
  )
393
409
  match_rate = match_count / min(len(string_values), 50)
394
410
 
@@ -6,8 +6,9 @@ Provides validation functions specific to each semantic type.
6
6
  from __future__ import annotations
7
7
 
8
8
  import re
9
+ from collections.abc import Callable
9
10
  from dataclasses import dataclass
10
- from typing import Any, Callable
11
+ from typing import Any
11
12
 
12
13
  from duckguard.semantic.detector import SemanticType
13
14