duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +29 -1
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +17 -5
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/cli/main.py +561 -56
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +588 -5
- duckguard/core/dataset.py +708 -3
- duckguard/core/result.py +328 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +301 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +61 -0
- duckguard/notifications/email.py +508 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +514 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +8 -5
- duckguard/rules/loader.py +5 -5
- duckguard/rules/schema.py +23 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/detector.py +17 -1
- duckguard/semantic/validators.py +2 -1
- duckguard-2.3.0.dist-info/METADATA +953 -0
- duckguard-2.3.0.dist-info/RECORD +77 -0
- duckguard-2.0.0.dist-info/METADATA +0 -221
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
"""Schema tracking implementation.
|
|
2
|
+
|
|
3
|
+
Provides functionality to capture and store schema snapshots over time.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import uuid
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import TYPE_CHECKING, Any
|
|
13
|
+
|
|
14
|
+
from duckguard.history.schema import QUERIES
|
|
15
|
+
from duckguard.history.storage import HistoryStorage
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from duckguard.core.dataset import Dataset
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ColumnSchema:
    """Schema metadata for a single table column.

    Attributes:
        name: Column name.
        dtype: Data type rendered as a string.
        nullable: Whether NULL values are permitted.
        position: Zero-based position of the column in the table.
    """

    name: str
    dtype: str
    nullable: bool
    position: int

    def to_dict(self) -> dict[str, Any]:
        """Serialize this column schema to a plain dictionary."""
        keys = ("name", "dtype", "nullable", "position")
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ColumnSchema:
        """Build a ColumnSchema from a dictionary; absent optional keys get defaults."""
        return cls(
            data["name"],
            data["dtype"],
            data.get("nullable", True),
            data.get("position", 0),
        )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class SchemaSnapshot:
    """A captured table schema at a single point in time.

    Attributes:
        source: Data source path the schema was read from.
        snapshot_id: Unique identifier for this snapshot.
        captured_at: Timestamp of capture.
        columns: Ordered column schemas.
        row_count: Row count at capture time, if known.
    """

    source: str
    snapshot_id: str
    captured_at: datetime
    columns: list[ColumnSchema]
    row_count: int | None = None

    @property
    def column_count(self) -> int:
        """Number of columns in the snapshot."""
        return len(self.columns)

    @property
    def column_names(self) -> list[str]:
        """Column names in table order."""
        return [col.name for col in self.columns]

    def get_column(self, name: str) -> ColumnSchema | None:
        """Return the column schema named *name*, or None if absent."""
        matches = (col for col in self.columns if col.name == name)
        return next(matches, None)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dictionary."""
        return {
            "source": self.source,
            "snapshot_id": self.snapshot_id,
            "captured_at": self.captured_at.isoformat(),
            "columns": [col.to_dict() for col in self.columns],
            "row_count": self.row_count,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SchemaSnapshot:
        """Rebuild a snapshot from a dictionary produced by ``to_dict``."""
        return cls(
            source=data["source"],
            snapshot_id=data["snapshot_id"],
            captured_at=datetime.fromisoformat(data["captured_at"]),
            columns=[ColumnSchema.from_dict(item) for item in data["columns"]],
            row_count=data.get("row_count"),
        )

    def __eq__(self, other: object) -> bool:
        """Structural equality: same source and column layout.

        snapshot_id and captured_at are deliberately excluded so two
        captures of an unchanged schema compare equal.
        """
        if not isinstance(other, SchemaSnapshot):
            return False
        if self.source != other.source:
            return False
        if len(self.columns) != len(other.columns):
            return False
        return all(
            a.name == b.name and a.dtype == b.dtype and a.nullable == b.nullable
            for a, b in zip(self.columns, other.columns)
        )
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class SchemaTracker:
    """Capture and query schema snapshots over time.

    Usage:
        from duckguard import connect
        from duckguard.schema_history import SchemaTracker

        tracker = SchemaTracker()
        data = connect("data.csv")

        # Capture current schema
        snapshot = tracker.capture(data)

        # Get history
        history = tracker.get_history(data.source)

        # Get latest snapshot
        latest = tracker.get_latest(data.source)
    """

    def __init__(self, storage: HistoryStorage | None = None):
        """Initialize the tracker.

        Args:
            storage: HistoryStorage to persist snapshots in; a default
                instance is created when omitted.
        """
        self._storage = storage or HistoryStorage()

    @property
    def storage(self) -> HistoryStorage:
        """The underlying HistoryStorage."""
        return self._storage

    def capture(self, dataset: Dataset) -> SchemaSnapshot:
        """Capture the dataset's current schema, persist it, and return it.

        Args:
            dataset: Dataset to read schema information from.

        Returns:
            The newly stored SchemaSnapshot.
        """
        snapshot = SchemaSnapshot(
            source=dataset.source,
            snapshot_id=str(uuid.uuid4()),
            # NOTE(review): naive local timestamp — confirm whether UTC is intended.
            captured_at=datetime.now(),
            columns=self._get_column_schemas(dataset),
            row_count=dataset.row_count,
        )
        self._store_snapshot(snapshot)
        return snapshot

    def get_history(
        self,
        source: str,
        limit: int = 50,
    ) -> list[SchemaSnapshot]:
        """Return snapshot history for a source, most recent first.

        Args:
            source: Data source path.
            limit: Maximum number of snapshots to return.

        Returns:
            List of SchemaSnapshot objects, most recent first.
        """
        conn = self._storage._get_connection()
        rows = conn.execute(QUERIES["get_schema_snapshots"], (source, limit)).fetchall()
        return [self._row_to_snapshot(record) for record in rows]

    def get_latest(self, source: str) -> SchemaSnapshot | None:
        """Return the most recent snapshot for a source.

        Args:
            source: Data source path.

        Returns:
            SchemaSnapshot, or None when no snapshots exist.
        """
        conn = self._storage._get_connection()
        row = conn.execute(QUERIES["get_latest_schema_snapshot"], (source,)).fetchone()
        if not row:
            return None
        return self._row_to_snapshot(row)

    def get_snapshot(self, snapshot_id: str) -> SchemaSnapshot | None:
        """Look up a single snapshot by its ID.

        Args:
            snapshot_id: Snapshot ID.

        Returns:
            SchemaSnapshot, or None when not found.
        """
        conn = self._storage._get_connection()
        row = conn.execute(QUERIES["get_schema_snapshot_by_id"], (snapshot_id,)).fetchone()
        if not row:
            return None
        return self._row_to_snapshot(row)

    def _get_column_schemas(self, dataset: Dataset) -> list[ColumnSchema]:
        """Read column metadata for the dataset via a DESCRIBE query."""
        ref = dataset.engine.get_source_reference(dataset.source)
        described = dataset.engine.execute(f"DESCRIBE {ref}").fetchall()

        schemas: list[ColumnSchema] = []
        for position, row in enumerate(described):
            # DESCRIBE rows are (name, type, null, ...); when the null
            # column is absent, treat the column as nullable.
            is_nullable = row[2] == "YES" if len(row) > 2 else True
            schemas.append(
                ColumnSchema(
                    name=row[0],
                    dtype=row[1],
                    nullable=is_nullable,
                    position=position,
                )
            )
        return schemas

    def _store_snapshot(self, snapshot: SchemaSnapshot) -> None:
        """Persist a snapshot row in the history database."""
        payload = json.dumps({
            "columns": [c.to_dict() for c in snapshot.columns]
        })
        conn = self._storage._get_connection()
        conn.execute(
            QUERIES["insert_schema_snapshot"],
            (
                snapshot.source,
                snapshot.snapshot_id,
                snapshot.captured_at.isoformat(),
                payload,
                snapshot.column_count,
                snapshot.row_count,
            ),
        )
        conn.commit()

    def _row_to_snapshot(self, row) -> SchemaSnapshot:
        """Hydrate a SchemaSnapshot from a stored database row."""
        parsed = json.loads(row["schema_json"])
        return SchemaSnapshot(
            source=row["source"],
            snapshot_id=row["snapshot_id"],
            captured_at=datetime.fromisoformat(row["captured_at"]),
            columns=[ColumnSchema.from_dict(c) for c in parsed["columns"]],
            row_count=row["row_count"],
        )
|
duckguard/semantic/__init__.py
CHANGED
|
@@ -12,13 +12,13 @@ Example:
|
|
|
12
12
|
print(result.confidence) # 0.95
|
|
13
13
|
"""
|
|
14
14
|
|
|
15
|
+
from duckguard.semantic.analyzer import SemanticAnalyzer
|
|
15
16
|
from duckguard.semantic.detector import (
|
|
16
17
|
SemanticType,
|
|
17
18
|
SemanticTypeResult,
|
|
18
19
|
detect_type,
|
|
19
20
|
detect_types_for_dataset,
|
|
20
21
|
)
|
|
21
|
-
from duckguard.semantic.analyzer import SemanticAnalyzer
|
|
22
22
|
from duckguard.semantic.validators import get_validator_for_type
|
|
23
23
|
|
|
24
24
|
__all__ = [
|
duckguard/semantic/analyzer.py
CHANGED
duckguard/semantic/detector.py
CHANGED
|
@@ -73,6 +73,7 @@ class SemanticType(Enum):
|
|
|
73
73
|
TITLE = "title"
|
|
74
74
|
SLUG = "slug"
|
|
75
75
|
CODE = "code"
|
|
76
|
+
IDENTIFIER = "identifier"
|
|
76
77
|
|
|
77
78
|
# Unknown
|
|
78
79
|
UNKNOWN = "unknown"
|
|
@@ -216,6 +217,9 @@ NAME_PATTERNS: dict[SemanticType, list[str]] = {
|
|
|
216
217
|
SemanticType.CODE: [
|
|
217
218
|
r"code", r".*_code$"
|
|
218
219
|
],
|
|
220
|
+
SemanticType.IDENTIFIER: [
|
|
221
|
+
r".*_id$", r".*_key$", r".*_code$", r".*_num(ber)?$", r".*_no$"
|
|
222
|
+
],
|
|
219
223
|
}
|
|
220
224
|
|
|
221
225
|
# Value patterns for detection
|
|
@@ -235,6 +239,15 @@ VALUE_PATTERNS: dict[SemanticType, str] = {
|
|
|
235
239
|
SemanticType.SLUG: r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
|
|
236
240
|
SemanticType.LATITUDE: r"^-?([1-8]?\d(\.\d+)?|90(\.0+)?)$",
|
|
237
241
|
SemanticType.LONGITUDE: r"^-?(1[0-7]\d(\.\d+)?|180(\.0+)?|\d{1,2}(\.\d+)?)$",
|
|
242
|
+
# Identifier pattern: PREFIX-NUMBER, ABC123, etc. (uppercase or mixed case with numbers)
|
|
243
|
+
SemanticType.IDENTIFIER: r"^[A-Z][A-Z0-9]*[-_]?\d+$|^[A-Z]{2,}[-_][A-Z0-9]+$",
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
# Patterns that must be matched case-sensitively (not using IGNORECASE)
|
|
247
|
+
CASE_SENSITIVE_PATTERNS = {
|
|
248
|
+
SemanticType.SLUG, # Slugs must be lowercase
|
|
249
|
+
SemanticType.IDENTIFIER, # Identifiers are typically uppercase
|
|
250
|
+
SemanticType.COUNTRY_CODE, # Country codes are uppercase
|
|
238
251
|
}
|
|
239
252
|
|
|
240
253
|
# PII types that should be flagged
|
|
@@ -269,6 +282,7 @@ TYPE_VALIDATIONS: dict[SemanticType, list[str]] = {
|
|
|
269
282
|
SemanticType.LONGITUDE: ["range: [-180, 180]"],
|
|
270
283
|
SemanticType.BOOLEAN: ["allowed_values: [true, false]"],
|
|
271
284
|
SemanticType.COUNTRY_CODE: ["pattern: country_code"],
|
|
285
|
+
SemanticType.IDENTIFIER: ["not_null"],
|
|
272
286
|
}
|
|
273
287
|
|
|
274
288
|
|
|
@@ -386,9 +400,11 @@ class SemanticTypeDetector:
|
|
|
386
400
|
string_values = [str(v) for v in sample_values if v is not None]
|
|
387
401
|
if string_values:
|
|
388
402
|
for sem_type, pattern in self.value_patterns.items():
|
|
403
|
+
# Use case-sensitive matching for certain patterns
|
|
404
|
+
flags = 0 if sem_type in CASE_SENSITIVE_PATTERNS else re.IGNORECASE
|
|
389
405
|
match_count = sum(
|
|
390
406
|
1 for v in string_values[:50]
|
|
391
|
-
if re.match(pattern, v,
|
|
407
|
+
if re.match(pattern, v, flags)
|
|
392
408
|
)
|
|
393
409
|
match_rate = match_count / min(len(string_values), 50)
|
|
394
410
|
|
duckguard/semantic/validators.py
CHANGED
|
@@ -6,8 +6,9 @@ Provides validation functions specific to each semantic type.
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
8
|
import re
|
|
9
|
+
from collections.abc import Callable
|
|
9
10
|
from dataclasses import dataclass
|
|
10
|
-
from typing import Any
|
|
11
|
+
from typing import Any
|
|
11
12
|
|
|
12
13
|
from duckguard.semantic.detector import SemanticType
|
|
13
14
|
|