duckguard 2.0.0__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/connectors/kafka.py
@@ -0,0 +1,343 @@
"""Apache Kafka connector for streaming data quality."""

from __future__ import annotations

import json
from typing import Any
from urllib.parse import parse_qs, urlparse

from duckguard.connectors.base import Connector, ConnectionConfig
from duckguard.core.dataset import Dataset
from duckguard.core.engine import DuckGuardEngine


class KafkaConnector(Connector):
    """
    Connector for Apache Kafka topics.

    Consumes messages from a Kafka topic and validates them.
    Supports JSON, Avro, and string message formats.

    Examples:
        # Using connection string
        data = connect(
            "kafka://broker1:9092,broker2:9092/my-topic",
            sample_size=1000
        )

        # Using options
        data = connect(
            "kafka://localhost:9092",
            table="my-topic",  # topic name
            group_id="duckguard-validator",
            sample_size=5000,
            format="json"
        )

        # With authentication
        data = connect(
            "kafka://broker:9092/topic",
            security_protocol="SASL_SSL",
            sasl_mechanism="PLAIN",
            sasl_username="user",
            sasl_password="pass"
        )
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        super().__init__(engine)
        self._consumer = None

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to Kafka and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object
        """
        try:
            from kafka import KafkaConsumer
        except ImportError:
            raise ImportError(
                "Kafka support requires kafka-python. "
                "Install with: pip install duckguard[kafka]"
            )

        # Parse connection parameters
        conn_params = self._parse_connection_string(config.source, config)

        topic = config.table or conn_params.get("topic")
        if not topic:
            raise ValueError("Topic name is required for Kafka connections")

        bootstrap_servers = conn_params.get("bootstrap_servers", "localhost:9092")
        group_id = conn_params.get("group_id", "duckguard-validator")
        sample_size = conn_params.get("sample_size", 1000)
        message_format = conn_params.get("format", "json")

        # Build consumer config
        consumer_config = {
            "bootstrap_servers": bootstrap_servers,
            "group_id": group_id,
            "auto_offset_reset": "earliest",
            "enable_auto_commit": False,
            "consumer_timeout_ms": conn_params.get("timeout_ms", 10000),
        }

        # Add security config if present
        if conn_params.get("security_protocol"):
            consumer_config["security_protocol"] = conn_params["security_protocol"]
        if conn_params.get("sasl_mechanism"):
            consumer_config["sasl_mechanism"] = conn_params["sasl_mechanism"]
        if conn_params.get("sasl_username"):
            consumer_config["sasl_plain_username"] = conn_params["sasl_username"]
        if conn_params.get("sasl_password"):
            consumer_config["sasl_plain_password"] = conn_params["sasl_password"]

        # Create consumer
        self._consumer = KafkaConsumer(topic, **consumer_config)

        return KafkaDataset(
            source=topic,
            engine=self.engine,
            name=topic,
            consumer=self._consumer,
            sample_size=sample_size,
            message_format=message_format,
        )

    def _parse_connection_string(self, conn_string: str, config: ConnectionConfig) -> dict:
        """Parse Kafka connection string."""
        params: dict[str, Any] = {}

        # Parse URL format: kafka://broker1:9092,broker2:9092/topic
        if conn_string.lower().startswith("kafka://"):
            # Remove protocol
            rest = conn_string[8:]

            # Split path
            if "/" in rest:
                brokers_part, path = rest.split("/", 1)
                params["topic"] = path.split("?")[0] if path else None
            else:
                brokers_part = rest.split("?")[0]

            params["bootstrap_servers"] = brokers_part

            # Parse query parameters
            parsed = urlparse(conn_string)
            if parsed.query:
                query_params = parse_qs(parsed.query)
                for key, values in query_params.items():
                    params[key] = values[0] if len(values) == 1 else values

        # Override with config options
        options = config.options or {}
        for key in [
            "bootstrap_servers",
            "group_id",
            "sample_size",
            "format",
            "timeout_ms",
            "security_protocol",
            "sasl_mechanism",
            "sasl_username",
            "sasl_password",
        ]:
            if key in options:
                params[key] = options[key]

        if config.table:
            params["topic"] = config.table

        return params

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a Kafka connection string."""
        return source.lower().startswith("kafka://")

    @classmethod
    def get_priority(cls) -> int:
        """Kafka connector has high priority."""
        return 55


class KafkaDataset(Dataset):
    """
    Dataset that consumes from Kafka topic.

    Samples messages and loads them into DuckDB for validation.
    """

    def __init__(
        self,
        source: str,
        engine: DuckGuardEngine,
        name: str,
        consumer: Any,
        sample_size: int = 1000,
        message_format: str = "json",
    ):
        super().__init__(source=source, engine=engine, name=name)
        self._consumer = consumer
        self._sample_size = sample_size
        self._message_format = message_format
        self._loaded = False
        self._view_name = f"_duckguard_kafka_{name.replace('-', '_')}"
        self._messages_consumed = 0

    def _ensure_loaded(self) -> None:
        """Consume messages and load into DuckDB if not already done."""
        if self._loaded:
            return

        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "Kafka connector requires pandas for data loading. "
                "Install with: pip install pandas"
            )

        messages = []
        count = 0

        # Consume messages
        for message in self._consumer:
            try:
                if self._message_format == "json":
                    value = json.loads(message.value.decode("utf-8"))
                else:
                    value = {"value": message.value.decode("utf-8")}

                # Add metadata
                value["_kafka_topic"] = message.topic
                value["_kafka_partition"] = message.partition
                value["_kafka_offset"] = message.offset
                value["_kafka_timestamp"] = message.timestamp

                messages.append(value)
                count += 1

                if count >= self._sample_size:
                    break
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                # Skip malformed messages but track them
                messages.append(
                    {
                        "_kafka_topic": message.topic,
                        "_kafka_partition": message.partition,
                        "_kafka_offset": message.offset,
                        "_kafka_error": str(e),
                    }
                )
                count += 1

        self._messages_consumed = count

        if not messages:
            df = pd.DataFrame()
        else:
            df = pd.json_normalize(messages)

        # Register with DuckDB
        self._engine.conn.register(self._view_name, df)
        self._source = self._view_name
        self._loaded = True

        # Close consumer
        self._consumer.close()

    @property
    def row_count(self) -> int:
        """Get number of messages consumed."""
        self._ensure_loaded()
        return self._messages_consumed

    @property
    def columns(self) -> list[str]:
        """Get column names from consumed messages."""
        if self._columns_cache is None:
            self._ensure_loaded()
            self._columns_cache = self._engine.get_columns(self._view_name)
        return self._columns_cache

    @property
    def messages_consumed(self) -> int:
        """Get the actual number of messages consumed."""
        self._ensure_loaded()
        return self._messages_consumed

    @property
    def parse_error_count(self) -> int:
        """Get count of messages that failed to parse."""
        self._ensure_loaded()
        sql = f"SELECT COUNT(*) FROM {self._view_name} WHERE _kafka_error IS NOT NULL"
        return self._engine.fetch_value(sql) or 0


class KafkaStreamValidator:
    """
    Continuous streaming validator for Kafka.

    Validates messages in real-time as they arrive.

    Example:
        validator = KafkaStreamValidator(
            "kafka://localhost:9092/orders",
            rules=[
                lambda msg: msg.get("amount", 0) > 0,
                lambda msg: msg.get("customer_id") is not None,
            ]
        )

        # Start validation (blocks)
        validator.start()

        # Or get validation results
        for result in validator.validate_stream():
            if not result.passed:
                print(f"Validation failed: {result.message}")
    """

    def __init__(
        self,
        source: str,
        rules: list[callable] | None = None,
        **options: Any,
    ):
        self.source = source
        self.rules = rules or []
        self.options = options
        self._consumer = None
        self._stats = {
            "messages_processed": 0,
            "messages_passed": 0,
            "messages_failed": 0,
        }

    def add_rule(self, rule: callable) -> "KafkaStreamValidator":
        """Add a validation rule."""
        self.rules.append(rule)
        return self

    def validate_message(self, message: dict) -> tuple[bool, list[str]]:
        """Validate a single message against all rules."""
        failures = []
        for i, rule in enumerate(self.rules):
            try:
                if not rule(message):
                    failures.append(f"Rule {i + 1} failed")
            except Exception as e:
                failures.append(f"Rule {i + 1} error: {e}")

        return len(failures) == 0, failures

    @property
    def stats(self) -> dict[str, int]:
        """Get validation statistics."""
        return self._stats.copy()
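The docstring examples in this file route through a top-level connect() helper. A minimal end-to-end sketch, assuming connect() is exported from the package root and a broker is reachable at localhost:9092 (neither is shown in this file):

from duckguard import connect  # assumed export, per the docstring examples above

# Sample up to 1,000 JSON messages from the "orders" topic into DuckDB.
orders = connect("kafka://localhost:9092/orders", sample_size=1000)

print(orders.row_count)          # messages actually consumed (capped by sample_size)
print(orders.columns)            # flattened JSON fields plus _kafka_* metadata columns
print(orders.parse_error_count)  # messages that failed JSON/UTF-8 decoding

Note that the consumer is closed after the first load, so each KafkaDataset is a one-shot sample rather than a live stream; per-message checks go through KafkaStreamValidator instead.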
duckguard/connectors/mongodb.py
@@ -0,0 +1,236 @@
"""MongoDB connector."""

from __future__ import annotations

from typing import Any
from urllib.parse import urlparse

from duckguard.connectors.base import Connector, ConnectionConfig
from duckguard.core.dataset import Dataset
from duckguard.core.engine import DuckGuardEngine


class MongoDBConnector(Connector):
    """
    Connector for MongoDB.

    Uses pymongo for connectivity. Converts MongoDB collections to
    a tabular format for validation using DuckDB.

    Examples:
        # Using connection string
        data = connect(
            "mongodb://user:pass@host:27017/database",
            table="orders"  # collection name
        )

        # Using MongoDB Atlas
        data = connect(
            "mongodb+srv://user:pass@cluster.mongodb.net/database",
            table="orders"
        )

        # Using options
        data = connect(
            "mongodb://host:27017",
            table="orders",
            database="mydb",
            sample_size=10000  # Sample for large collections
        )
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        super().__init__(engine)
        self._client = None
        self._db = None

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to MongoDB and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object
        """
        try:
            from pymongo import MongoClient
        except ImportError:
            raise ImportError(
                "MongoDB support requires pymongo. "
                "Install with: pip install duckguard[mongodb]"
            )

        if not config.table:
            raise ValueError(
                "Collection name is required for MongoDB connections (use table parameter)"
            )

        # Parse connection parameters
        conn_params = self._parse_connection_string(config.source, config)

        # Connect to MongoDB
        self._client = MongoClient(conn_params["connection_string"])

        database_name = config.database or conn_params.get("database")
        if not database_name:
            raise ValueError("Database name is required for MongoDB connections")

        self._db = self._client[database_name]
        collection_name = config.table

        # Get sample size from options
        sample_size = (config.options or {}).get("sample_size", 100000)

        return MongoDBDataset(
            source=collection_name,
            engine=self.engine,
            name=collection_name,
            database=self._db,
            collection_name=collection_name,
            sample_size=sample_size,
        )

    def _parse_connection_string(self, conn_string: str, config: ConnectionConfig) -> dict:
        """Parse MongoDB connection string."""
        params: dict[str, Any] = {}

        # Keep the full connection string for pymongo
        params["connection_string"] = conn_string

        # Parse to extract database name if present
        if conn_string.lower().startswith(("mongodb://", "mongodb+srv://")):
            parsed = urlparse(conn_string)
            if parsed.path and parsed.path != "/":
                params["database"] = parsed.path.lstrip("/").split("?")[0]

        # Override with config options
        if config.database:
            params["database"] = config.database

        return params

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a MongoDB connection string."""
        source_lower = source.lower()
        return source_lower.startswith(("mongodb://", "mongodb+srv://"))

    @classmethod
    def get_priority(cls) -> int:
        """MongoDB connector has high priority."""
        return 55


class MongoDBDataset(Dataset):
    """
    Dataset that queries MongoDB.

    Loads data from MongoDB collection into DuckDB for validation.
    """

    def __init__(
        self,
        source: str,
        engine: DuckGuardEngine,
        name: str,
        database: Any,
        collection_name: str,
        sample_size: int = 100000,
    ):
        super().__init__(source=source, engine=engine, name=name)
        self._database = database
        self._collection_name = collection_name
        self._sample_size = sample_size
        self._collection = database[collection_name]
        self._loaded = False
        self._view_name = f"_duckguard_mongo_{collection_name}"

    def _ensure_loaded(self) -> None:
        """Load MongoDB data into DuckDB if not already loaded."""
        if self._loaded:
            return

        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "MongoDB connector requires pandas for data loading. "
                "Install with: pip install pandas"
            )

        # Get documents from MongoDB
        cursor = self._collection.find().limit(self._sample_size)
        documents = list(cursor)

        if not documents:
            # Create empty dataframe with no columns
            df = pd.DataFrame()
        else:
            # Flatten nested documents and convert to DataFrame
            df = pd.json_normalize(documents)

            # Convert ObjectId to string
            if "_id" in df.columns:
                df["_id"] = df["_id"].astype(str)

        # Register with DuckDB
        self._engine.conn.register(self._view_name, df)
        self._source = self._view_name
        self._loaded = True

    @property
    def row_count(self) -> int:
        """Get row count."""
        if self._row_count_cache is None:
            # Use MongoDB count for accuracy
            self._row_count_cache = self._collection.count_documents({})
        return self._row_count_cache

    @property
    def columns(self) -> list[str]:
        """Get column names (field names from documents)."""
        if self._columns_cache is None:
            self._ensure_loaded()
            self._columns_cache = self._engine.get_columns(self._view_name)
        return self._columns_cache

    @property
    def sample_row_count(self) -> int:
        """Get the number of rows in the sample (may be less than total)."""
        self._ensure_loaded()
        return self._engine.get_row_count(self._view_name)


class MongoDBColumn:
    """Column for MongoDB datasets with document-aware validation."""

    def __init__(self, name: str, dataset: MongoDBDataset):
        self._name = name
        self._dataset = dataset

    @property
    def null_percent(self) -> float:
        """Get null/missing percentage."""
        self._dataset._ensure_loaded()
        stats = self._dataset._engine.get_column_stats(
            self._dataset._view_name, self._name
        )
        return stats.get("null_percent", 0.0)

    @property
    def field_exists_percent(self) -> float:
        """
        Get percentage of documents that have this field.

        This is MongoDB-specific - checks for field existence.
        """
        total = self._dataset._collection.count_documents({})
        if total == 0:
            return 100.0

        with_field = self._dataset._collection.count_documents(
            {self._name: {"$exists": True}}
        )
        return (with_field / total) * 100
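As with the Kafka connector, a minimal sketch of the documented call pattern, assuming connect() is exported from the package root and a reachable mongod instance (placeholder credentials and database name):

from duckguard import connect  # assumed export, per the docstring examples above

# Load a capped sample of the "orders" collection into DuckDB for validation.
orders = connect(
    "mongodb://user:pass@localhost:27017/shop",
    table="orders",      # collection name
    sample_size=10_000,  # documents pulled via find().limit(...)
)

print(orders.row_count)         # total documents, counted server-side with count_documents({})
print(orders.sample_row_count)  # documents actually loaded into DuckDB (may be smaller)

The row_count/sample_row_count split matters here: checks run against the loaded sample, so results can diverge from the full collection whenever sample_size is smaller than the collection.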
duckguard/connectors/mysql.py
@@ -0,0 +1,121 @@
"""MySQL connector."""

from __future__ import annotations

from urllib.parse import urlparse

from duckguard.connectors.base import Connector, ConnectionConfig
from duckguard.core.dataset import Dataset
from duckguard.core.engine import DuckGuardEngine


class MySQLConnector(Connector):
    """
    Connector for MySQL databases.

    Uses DuckDB's mysql extension for efficient query pushdown.
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        super().__init__(engine)
        self._setup_extension()

    def _setup_extension(self) -> None:
        """Install and load the mysql extension."""
        try:
            self.engine.execute("INSTALL mysql")
            self.engine.execute("LOAD mysql")
        except Exception:
            # Extension might already be loaded
            pass

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to MySQL and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object
        """
        if not config.table:
            raise ValueError("Table name is required for MySQL connections")

        # Parse connection string
        conn_info = self._parse_connection_string(config.source)

        table = config.table
        database = config.database or conn_info.get("database", "")

        # Create a unique alias for this connection
        alias = f"mysql_{table}"

        # Build MySQL connection string for DuckDB
        mysql_conn = self._build_duckdb_connection(conn_info)

        # Attach the database
        attach_sql = f"ATTACH '{mysql_conn}' AS {alias} (TYPE mysql)"

        try:
            self.engine.execute(attach_sql)
        except Exception as e:
            if "already exists" not in str(e).lower():
                raise

        # The source reference for DuckDB
        if database:
            source_ref = f"{alias}.{database}.{table}"
        else:
            source_ref = f"{alias}.{table}"

        # Register as a view for easier access
        view_name = f"_duckguard_{table}"
        try:
            self.engine.execute(f"CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM {source_ref}")
        except Exception:
            pass

        return Dataset(source=view_name, engine=self.engine, name=table)

    def _parse_connection_string(self, conn_string: str) -> dict[str, str]:
        """Parse MySQL connection string."""
        # Handle mysql+pymysql:// format
        conn_string = conn_string.replace("mysql+pymysql://", "mysql://")

        parsed = urlparse(conn_string)

        return {
            "host": parsed.hostname or "localhost",
            "port": str(parsed.port or 3306),
            "database": parsed.path.lstrip("/") if parsed.path else "",
            "user": parsed.username or "",
            "password": parsed.password or "",
        }

    def _build_duckdb_connection(self, conn_info: dict[str, str]) -> str:
        """Build connection string for DuckDB MySQL extension."""
        parts = []

        if conn_info.get("host"):
            parts.append(f"host={conn_info['host']}")
        if conn_info.get("port"):
            parts.append(f"port={conn_info['port']}")
        if conn_info.get("user"):
            parts.append(f"user={conn_info['user']}")
        if conn_info.get("password"):
            parts.append(f"password={conn_info['password']}")
        if conn_info.get("database"):
            parts.append(f"database={conn_info['database']}")

        return " ".join(parts)

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a MySQL connection string."""
        return source.lower().startswith(("mysql://", "mysql+pymysql://"))

    @classmethod
    def get_priority(cls) -> int:
        """Database connectors have high priority."""
        return 50
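A matching sketch for the MySQL path, again assuming a root-level connect() export and placeholder credentials. Unlike the Kafka and MongoDB connectors, no client-side sampling happens here: DuckDB's mysql extension queries the attached table directly through the registered view.

from duckguard import connect  # assumed export, mirroring the other connectors' examples

# ATTACH the MySQL database via DuckDB's mysql extension and validate a table in place.
users = connect("mysql://app:secret@localhost:3306/shop", table="users")

# Assumes the base Dataset exposes row_count (the Kafka/MongoDB subclasses above override it).
print(users.row_count)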