duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,343 @@
1
+ """Apache Kafka connector for streaming data quality."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+ from urllib.parse import parse_qs, urlparse
8
+
9
+ from duckguard.connectors.base import Connector, ConnectionConfig
10
+ from duckguard.core.dataset import Dataset
11
+ from duckguard.core.engine import DuckGuardEngine
12
+
13
+
14
class KafkaConnector(Connector):
    """
    Connector for Apache Kafka topics.

    Consumes messages from a Kafka topic and validates them.
    Supports JSON, Avro, and string message formats.

    Examples:
        # Using connection string
        data = connect(
            "kafka://broker1:9092,broker2:9092/my-topic",
            sample_size=1000
        )

        # Using options
        data = connect(
            "kafka://localhost:9092",
            table="my-topic",  # topic name
            group_id="duckguard-validator",
            sample_size=5000,
            format="json"
        )

        # With authentication
        data = connect(
            "kafka://broker:9092/topic",
            security_protocol="SASL_SSL",
            sasl_mechanism="PLAIN",
            sasl_username="user",
            sasl_password="pass"
        )
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        super().__init__(engine)
        # Held so the consumer created in connect() can be inspected/closed.
        self._consumer = None

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to Kafka and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object

        Raises:
            ImportError: If kafka-python is not installed.
            ValueError: If no topic name could be determined.
        """
        try:
            from kafka import KafkaConsumer
        except ImportError:
            raise ImportError(
                "Kafka support requires kafka-python. "
                "Install with: pip install duckguard[kafka]"
            )

        # Parse connection parameters
        conn_params = self._parse_connection_string(config.source, config)

        topic = config.table or conn_params.get("topic")
        if not topic:
            raise ValueError("Topic name is required for Kafka connections")

        bootstrap_servers = conn_params.get("bootstrap_servers", "localhost:9092")
        group_id = conn_params.get("group_id", "duckguard-validator")
        # BUG FIX: values sourced from URL query parameters arrive as strings
        # (parse_qs yields str), so numeric settings must be coerced to int —
        # otherwise e.g. "?sample_size=500" breaks the consume-loop comparison
        # and "?timeout_ms=5000" hands kafka-python a string timeout.
        sample_size = int(conn_params.get("sample_size", 1000))
        message_format = conn_params.get("format", "json")

        # Build consumer config
        consumer_config = {
            "bootstrap_servers": bootstrap_servers,
            "group_id": group_id,
            "auto_offset_reset": "earliest",
            "enable_auto_commit": False,
            # Coerced for the same reason as sample_size above.
            "consumer_timeout_ms": int(conn_params.get("timeout_ms", 10000)),
        }

        # Add security config if present
        if conn_params.get("security_protocol"):
            consumer_config["security_protocol"] = conn_params["security_protocol"]
        if conn_params.get("sasl_mechanism"):
            consumer_config["sasl_mechanism"] = conn_params["sasl_mechanism"]
        if conn_params.get("sasl_username"):
            consumer_config["sasl_plain_username"] = conn_params["sasl_username"]
        if conn_params.get("sasl_password"):
            consumer_config["sasl_plain_password"] = conn_params["sasl_password"]

        # Create consumer
        self._consumer = KafkaConsumer(topic, **consumer_config)

        return KafkaDataset(
            source=topic,
            engine=self.engine,
            name=topic,
            consumer=self._consumer,
            sample_size=sample_size,
            message_format=message_format,
        )

    def _parse_connection_string(self, conn_string: str, config: ConnectionConfig) -> dict:
        """Parse a Kafka connection string into a parameter dict.

        Handles ``kafka://broker1:9092,broker2:9092/topic?key=value`` URLs.
        Explicit ``config`` options override URL-derived values, and
        ``config.table`` overrides any topic taken from the URL path.
        """
        params: dict[str, Any] = {}

        # Parse URL format: kafka://broker1:9092,broker2:9092/topic
        if conn_string.lower().startswith("kafka://"):
            # Remove protocol
            rest = conn_string[8:]

            # Split path: the first path segment (before any query) is the topic.
            if "/" in rest:
                brokers_part, path = rest.split("/", 1)
                params["topic"] = path.split("?")[0] if path else None
            else:
                brokers_part = rest.split("?")[0]

            params["bootstrap_servers"] = brokers_part

            # Parse query parameters
            parsed = urlparse(conn_string)
            if parsed.query:
                query_params = parse_qs(parsed.query)
                for key, values in query_params.items():
                    params[key] = values[0] if len(values) == 1 else values

        # Override with config options
        options = config.options or {}
        for key in [
            "bootstrap_servers",
            "group_id",
            "sample_size",
            "format",
            "timeout_ms",
            "security_protocol",
            "sasl_mechanism",
            "sasl_username",
            "sasl_password",
        ]:
            if key in options:
                params[key] = options[key]

        if config.table:
            params["topic"] = config.table

        return params

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a Kafka connection string."""
        return source.lower().startswith("kafka://")

    @classmethod
    def get_priority(cls) -> int:
        """Kafka connector has high priority."""
        return 55
167
+
168
+
169
class KafkaDataset(Dataset):
    """
    Dataset that consumes from a Kafka topic.

    Samples up to ``sample_size`` messages, flattens them with
    ``pandas.json_normalize``, and registers the result as a DuckDB view
    for validation. Consumption happens lazily on first access.
    """

    def __init__(
        self,
        source: str,
        engine: DuckGuardEngine,
        name: str,
        consumer: Any,
        sample_size: int = 1000,
        message_format: str = "json",
    ):
        super().__init__(source=source, engine=engine, name=name)
        self._consumer = consumer
        self._sample_size = sample_size
        self._message_format = message_format
        self._loaded = False
        # Hyphens are common in topic names but invalid in SQL identifiers.
        self._view_name = f"_duckguard_kafka_{name.replace('-', '_')}"
        self._messages_consumed = 0

    def _ensure_loaded(self) -> None:
        """Consume messages and load into DuckDB if not already done.

        Closes the consumer after the sample is loaded; the view is the
        source of truth from then on.
        """
        if self._loaded:
            return

        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "Kafka connector requires pandas for data loading. "
                "Install with: pip install pandas"
            )

        messages = []
        count = 0

        # Consume messages (bounded by sample_size; the consumer's
        # consumer_timeout_ms ends the loop on an idle topic).
        for message in self._consumer:
            try:
                if self._message_format == "json":
                    value = json.loads(message.value.decode("utf-8"))
                else:
                    value = {"value": message.value.decode("utf-8")}

                # Add metadata so validations can reference provenance.
                value["_kafka_topic"] = message.topic
                value["_kafka_partition"] = message.partition
                value["_kafka_offset"] = message.offset
                value["_kafka_timestamp"] = message.timestamp

                messages.append(value)
                count += 1

                if count >= self._sample_size:
                    break
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                # Skip malformed messages but track them
                messages.append(
                    {
                        "_kafka_topic": message.topic,
                        "_kafka_partition": message.partition,
                        "_kafka_offset": message.offset,
                        "_kafka_error": str(e),
                    }
                )
                count += 1

        self._messages_consumed = count

        if not messages:
            df = pd.DataFrame()
        else:
            df = pd.json_normalize(messages)

        # Register with DuckDB
        self._engine.conn.register(self._view_name, df)
        self._source = self._view_name
        self._loaded = True

        # Close consumer
        self._consumer.close()

    @property
    def row_count(self) -> int:
        """Get number of messages consumed."""
        self._ensure_loaded()
        return self._messages_consumed

    @property
    def columns(self) -> list[str]:
        """Get column names from consumed messages."""
        if self._columns_cache is None:
            self._ensure_loaded()
            self._columns_cache = self._engine.get_columns(self._view_name)
        return self._columns_cache

    @property
    def messages_consumed(self) -> int:
        """Get the actual number of messages consumed."""
        self._ensure_loaded()
        return self._messages_consumed

    @property
    def parse_error_count(self) -> int:
        """Get count of messages that failed to parse."""
        self._ensure_loaded()
        # BUG FIX: the _kafka_error column only exists in the view when at
        # least one message failed to parse (json_normalize creates columns
        # from observed keys). Querying it unconditionally raised on the
        # all-messages-parsed happy path.
        if "_kafka_error" not in self.columns:
            return 0
        sql = f"SELECT COUNT(*) FROM {self._view_name} WHERE _kafka_error IS NOT NULL"
        return self._engine.fetch_value(sql) or 0
281
+
282
+
283
class KafkaStreamValidator:
    """
    Continuous streaming validator for Kafka.

    Validates messages in real-time as they arrive.

    Example:
        validator = KafkaStreamValidator(
            "kafka://localhost:9092/orders",
            rules=[
                lambda msg: msg.get("amount", 0) > 0,
                lambda msg: msg.get("customer_id") is not None,
            ]
        )

        # Start validation (blocks)
        validator.start()

        # Or get validation results
        for result in validator.validate_stream():
            if not result.passed:
                print(f"Validation failed: {result.message}")
    """

    def __init__(
        self,
        source: str,
        rules: list[callable] | None = None,
        **options: Any,
    ):
        self.source = source
        self.rules = [] if rules is None else rules
        self.options = options
        self._consumer = None
        # Running counters; exposed read-only through the stats property.
        self._stats = dict.fromkeys(
            ("messages_processed", "messages_passed", "messages_failed"), 0
        )

    def add_rule(self, rule: callable) -> "KafkaStreamValidator":
        """Register a validation rule; returns self for chaining."""
        self.rules.append(rule)
        return self

    def validate_message(self, message: dict) -> tuple[bool, list[str]]:
        """Run every rule against one message.

        Returns:
            (passed, failures) where ``failures`` lists a human-readable
            entry per rule that returned falsy or raised.
        """
        failures: list[str] = []
        for rule_no, check in enumerate(self.rules, start=1):
            try:
                verdict = check(message)
            except Exception as e:
                failures.append(f"Rule {rule_no} error: {e}")
            else:
                if not verdict:
                    failures.append(f"Rule {rule_no} failed")

        return len(failures) == 0, failures

    @property
    def stats(self) -> dict[str, int]:
        """Get a snapshot copy of the validation statistics."""
        return self._stats.copy()
@@ -0,0 +1,236 @@
1
+ """MongoDB connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+ from urllib.parse import urlparse
7
+
8
+ from duckguard.connectors.base import Connector, ConnectionConfig
9
+ from duckguard.core.dataset import Dataset
10
+ from duckguard.core.engine import DuckGuardEngine
11
+
12
+
13
class MongoDBConnector(Connector):
    """
    Connector for MongoDB.

    Uses pymongo for connectivity. Converts MongoDB collections to
    a tabular format for validation using DuckDB.

    Examples:
        # Using connection string
        data = connect(
            "mongodb://user:pass@host:27017/database",
            table="orders"  # collection name
        )

        # Using MongoDB Atlas
        data = connect(
            "mongodb+srv://user:pass@cluster.mongodb.net/database",
            table="orders"
        )

        # Using options
        data = connect(
            "mongodb://host:27017",
            table="orders",
            database="mydb",
            sample_size=10000  # Sample for large collections
        )
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        super().__init__(engine)
        self._client = None
        self._db = None

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to MongoDB and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object

        Raises:
            ImportError: If pymongo is not installed.
            ValueError: If the collection or database name is missing.
        """
        try:
            from pymongo import MongoClient
        except ImportError:
            raise ImportError(
                "MongoDB support requires pymongo. "
                "Install with: pip install duckguard[mongodb]"
            )

        if not config.table:
            raise ValueError(
                "Collection name is required for MongoDB connections (use table parameter)"
            )

        # Resolve connection parameters, then open the client.
        params = self._parse_connection_string(config.source, config)
        self._client = MongoClient(params["connection_string"])

        db_name = config.database or params.get("database")
        if not db_name:
            raise ValueError("Database name is required for MongoDB connections")

        self._db = self._client[db_name]
        collection = config.table

        # Sampling cap for large collections (default 100k documents).
        sample_size = (config.options or {}).get("sample_size", 100000)

        return MongoDBDataset(
            source=collection,
            engine=self.engine,
            name=collection,
            database=self._db,
            collection_name=collection,
            sample_size=sample_size,
        )

    def _parse_connection_string(self, conn_string: str, config: ConnectionConfig) -> dict:
        """Parse a MongoDB connection string.

        The full string is kept verbatim for pymongo; only the database
        name is extracted from the URL path, and an explicit
        ``config.database`` wins over it.
        """
        params: dict[str, Any] = {"connection_string": conn_string}

        lowered = conn_string.lower()
        if lowered.startswith(("mongodb://", "mongodb+srv://")):
            parsed = urlparse(conn_string)
            path = parsed.path
            if path and path != "/":
                params["database"] = path.lstrip("/").split("?")[0]

        if config.database:
            params["database"] = config.database

        return params

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a MongoDB connection string."""
        return source.lower().startswith(("mongodb://", "mongodb+srv://"))

    @classmethod
    def get_priority(cls) -> int:
        """MongoDB connector has high priority."""
        return 55
124
+
125
+
126
class MongoDBDataset(Dataset):
    """
    Dataset that queries MongoDB.

    Loads a sample of documents from a MongoDB collection into DuckDB
    (via pandas) for validation. Loading happens lazily on first access.
    """

    def __init__(
        self,
        source: str,
        engine: DuckGuardEngine,
        name: str,
        database: Any,
        collection_name: str,
        sample_size: int = 100000,
    ):
        super().__init__(source=source, engine=engine, name=name)
        self._database = database
        self._collection_name = collection_name
        self._sample_size = sample_size
        self._collection = database[collection_name]
        self._loaded = False
        # BUG FIX: collection names may contain characters that are invalid
        # in unquoted SQL identifiers (e.g. '-', '.'); sanitize the view name
        # the same way KafkaDataset does, so registration/queries don't break.
        safe_name = "".join(
            ch if ch.isalnum() or ch == "_" else "_" for ch in collection_name
        )
        self._view_name = f"_duckguard_mongo_{safe_name}"

    def _ensure_loaded(self) -> None:
        """Load MongoDB data into DuckDB if not already loaded."""
        if self._loaded:
            return

        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "MongoDB connector requires pandas for data loading. "
                "Install with: pip install pandas"
            )

        # Get documents from MongoDB (capped at the configured sample size)
        cursor = self._collection.find().limit(self._sample_size)
        documents = list(cursor)

        if not documents:
            # Create empty dataframe with no columns
            df = pd.DataFrame()
        else:
            # Flatten nested documents and convert to DataFrame
            df = pd.json_normalize(documents)

            # Convert ObjectId to string so DuckDB can handle the column
            if "_id" in df.columns:
                df["_id"] = df["_id"].astype(str)

        # Register with DuckDB
        self._engine.conn.register(self._view_name, df)
        self._source = self._view_name
        self._loaded = True

    @property
    def row_count(self) -> int:
        """Get row count (total documents in the collection, not the sample)."""
        if self._row_count_cache is None:
            # Use MongoDB count for accuracy
            self._row_count_cache = self._collection.count_documents({})
        return self._row_count_cache

    @property
    def columns(self) -> list[str]:
        """Get column names (field names from documents)."""
        if self._columns_cache is None:
            self._ensure_loaded()
            self._columns_cache = self._engine.get_columns(self._view_name)
        return self._columns_cache

    @property
    def sample_row_count(self) -> int:
        """Get the number of rows in the sample (may be less than total)."""
        self._ensure_loaded()
        return self._engine.get_row_count(self._view_name)
204
+
205
+
206
class MongoDBColumn:
    """Column for MongoDB datasets with document-aware validation."""

    def __init__(self, name: str, dataset: MongoDBDataset):
        self._name = name
        self._dataset = dataset

    @property
    def null_percent(self) -> float:
        """Get null/missing percentage from the loaded DuckDB sample."""
        ds = self._dataset
        ds._ensure_loaded()
        column_stats = ds._engine.get_column_stats(ds._view_name, self._name)
        return column_stats.get("null_percent", 0.0)

    @property
    def field_exists_percent(self) -> float:
        """
        Get percentage of documents that have this field.

        This is MongoDB-specific - checks for field existence.
        """
        collection = self._dataset._collection
        total = collection.count_documents({})
        # An empty collection vacuously has the field everywhere.
        if total == 0:
            return 100.0

        matching = collection.count_documents({self._name: {"$exists": True}})
        return 100 * matching / total
@@ -0,0 +1,121 @@
1
+ """MySQL connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from urllib.parse import urlparse
6
+
7
+ from duckguard.connectors.base import Connector, ConnectionConfig
8
+ from duckguard.core.dataset import Dataset
9
+ from duckguard.core.engine import DuckGuardEngine
10
+
11
+
12
class MySQLConnector(Connector):
    """
    Connector for MySQL databases.

    Uses DuckDB's mysql extension for efficient query pushdown.
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        super().__init__(engine)
        self._setup_extension()

    def _setup_extension(self) -> None:
        """Install and load the mysql extension."""
        try:
            self.engine.execute("INSTALL mysql")
            self.engine.execute("LOAD mysql")
        except Exception:
            # Extension might already be loaded
            pass

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to MySQL and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object

        Raises:
            ValueError: If no table name is provided.
        """
        if not config.table:
            raise ValueError("Table name is required for MySQL connections")

        # Parse connection string
        conn_info = self._parse_connection_string(config.source)

        table = config.table
        database = config.database or conn_info.get("database", "")

        # BUG FIX: table names may contain characters that are invalid in
        # unquoted SQL identifiers (e.g. '-'); sanitize for alias/view names.
        # The remote reference below still uses the real table name.
        safe_table = "".join(c if c.isalnum() or c == "_" else "_" for c in table)

        # Create a unique alias for this connection
        alias = f"mysql_{safe_table}"

        # Build MySQL connection string for DuckDB
        mysql_conn = self._build_duckdb_connection(conn_info)

        # BUG FIX: escape single quotes so credentials containing ' do not
        # break (or inject into) the ATTACH statement's string literal.
        attach_sql = f"ATTACH '{mysql_conn.replace(chr(39), chr(39) * 2)}' AS {alias} (TYPE mysql)"

        try:
            self.engine.execute(attach_sql)
        except Exception as e:
            # Re-attaching under an existing alias is harmless; anything else is fatal.
            if "already exists" not in str(e).lower():
                raise

        # The source reference for DuckDB
        if database:
            source_ref = f"{alias}.{database}.{table}"
        else:
            source_ref = f"{alias}.{table}"

        # Register as a view for easier access; best-effort — if view
        # creation fails the Dataset will surface the error on first query.
        view_name = f"_duckguard_{safe_table}"
        try:
            self.engine.execute(
                f"CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM {source_ref}"
            )
        except Exception:
            pass

        return Dataset(source=view_name, engine=self.engine, name=table)

    def _parse_connection_string(self, conn_string: str) -> dict[str, str]:
        """Parse a MySQL connection string into its component parts."""
        # Handle mysql+pymysql:// format
        conn_string = conn_string.replace("mysql+pymysql://", "mysql://")

        parsed = urlparse(conn_string)

        return {
            "host": parsed.hostname or "localhost",
            "port": str(parsed.port or 3306),
            "database": parsed.path.lstrip("/") if parsed.path else "",
            "user": parsed.username or "",
            "password": parsed.password or "",
        }

    def _build_duckdb_connection(self, conn_info: dict[str, str]) -> str:
        """Build a key=value connection string for DuckDB's MySQL extension."""
        parts = []

        if conn_info.get("host"):
            parts.append(f"host={conn_info['host']}")
        if conn_info.get("port"):
            parts.append(f"port={conn_info['port']}")
        if conn_info.get("user"):
            parts.append(f"user={conn_info['user']}")
        if conn_info.get("password"):
            parts.append(f"password={conn_info['password']}")
        if conn_info.get("database"):
            parts.append(f"database={conn_info['database']}")

        return " ".join(parts)

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a MySQL connection string."""
        return source.lower().startswith(("mysql://", "mysql+pymysql://"))

    @classmethod
    def get_priority(cls) -> int:
        """Database connectors have high priority."""
        return 50