datalex-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datalex_cli/__init__.py +1 -0
- datalex_cli/datalex_cli.py +658 -0
- datalex_cli/main.py +2925 -0
- datalex_cli-0.1.1.dist-info/METADATA +228 -0
- datalex_cli-0.1.1.dist-info/RECORD +64 -0
- datalex_cli-0.1.1.dist-info/WHEEL +5 -0
- datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
- datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
- datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
- datalex_core/__init__.py +94 -0
- datalex_core/_schemas/datalex/common.schema.json +127 -0
- datalex_core/_schemas/datalex/domain.schema.json +24 -0
- datalex_core/_schemas/datalex/entity.schema.json +158 -0
- datalex_core/_schemas/datalex/model.schema.json +141 -0
- datalex_core/_schemas/datalex/policy.schema.json +70 -0
- datalex_core/_schemas/datalex/project.schema.json +82 -0
- datalex_core/_schemas/datalex/snippet.schema.json +24 -0
- datalex_core/_schemas/datalex/source.schema.json +104 -0
- datalex_core/_schemas/datalex/term.schema.json +30 -0
- datalex_core/canonical.py +166 -0
- datalex_core/completion.py +204 -0
- datalex_core/connectors/__init__.py +39 -0
- datalex_core/connectors/base.py +417 -0
- datalex_core/connectors/bigquery.py +229 -0
- datalex_core/connectors/databricks.py +262 -0
- datalex_core/connectors/mysql.py +266 -0
- datalex_core/connectors/postgres.py +309 -0
- datalex_core/connectors/redshift.py +298 -0
- datalex_core/connectors/snowflake.py +336 -0
- datalex_core/connectors/sqlserver.py +425 -0
- datalex_core/datalex/__init__.py +26 -0
- datalex_core/datalex/diff.py +188 -0
- datalex_core/datalex/errors.py +85 -0
- datalex_core/datalex/loader.py +512 -0
- datalex_core/datalex/migrate_layout.py +382 -0
- datalex_core/datalex/parse_cache.py +102 -0
- datalex_core/datalex/project.py +214 -0
- datalex_core/datalex/types.py +224 -0
- datalex_core/dbt/__init__.py +18 -0
- datalex_core/dbt/emit.py +344 -0
- datalex_core/dbt/manifest.py +329 -0
- datalex_core/dbt/profiles.py +185 -0
- datalex_core/dbt/sync.py +279 -0
- datalex_core/dbt/warehouse.py +215 -0
- datalex_core/dialects/__init__.py +15 -0
- datalex_core/dialects/_common.py +48 -0
- datalex_core/dialects/base.py +47 -0
- datalex_core/dialects/postgres.py +164 -0
- datalex_core/dialects/registry.py +36 -0
- datalex_core/dialects/snowflake.py +129 -0
- datalex_core/diffing.py +358 -0
- datalex_core/docs_generator.py +797 -0
- datalex_core/doctor.py +181 -0
- datalex_core/generators.py +478 -0
- datalex_core/importers.py +1176 -0
- datalex_core/issues.py +23 -0
- datalex_core/loader.py +21 -0
- datalex_core/migrate.py +316 -0
- datalex_core/modeling.py +679 -0
- datalex_core/packages.py +430 -0
- datalex_core/policy.py +1037 -0
- datalex_core/resolver.py +456 -0
- datalex_core/schema.py +54 -0
- datalex_core/semantic.py +1561 -0
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
"""Base connector interface and registry for database connectors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import date
|
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class ConnectorConfig:
    """Configuration for a database connector.

    Only ``connector_type`` is required; every other setting defaults to an
    empty value. Credential-bearing fields are excluded from the generated
    ``repr`` so that printing or logging a config does not expose secrets.
    """

    connector_type: str
    host: str = ""
    port: int = 0
    database: str = ""
    schema: str = ""
    user: str = ""
    # Secret: kept out of repr().
    password: str = field(default="", repr=False)
    warehouse: str = ""
    project: str = ""
    dataset: str = ""
    catalog: str = ""
    # Secret: kept out of repr().
    token: str = field(default="", repr=False)
    private_key_path: str = ""
    # May embed credentials, so it is kept out of repr() as well.
    connection_string: str = field(default="", repr=False)
    # Optional allow-list / deny-list of table names.
    tables: Optional[List[str]] = None
    exclude_tables: Optional[List[str]] = None
    # Metadata copied into the generated model header.
    model_name: str = "imported_model"
    domain: str = "imported"
    owners: Optional[List[str]] = None
    extra: Dict[str, Any] = field(default_factory=dict)

    def effective_owners(self) -> List[str]:
        """Return the configured owners, or a placeholder owner if none are set.

        A new list is returned on every call so callers can store or mutate
        the result without affecting this config.
        """
        if self.owners:
            return list(self.owners)
        return ["data-team@example.com"]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class ConnectorResult:
    """Outcome of a schema pull: the assembled model plus discovery counters."""

    model: Dict[str, Any]
    tables_found: int = 0
    columns_found: int = 0
    relationships_found: int = 0
    indexes_found: int = 0
    warnings: List[str] = field(default_factory=list)

    def summary(self) -> str:
        """Render the counters, followed by any warnings, one item per line."""
        report = [
            f"Tables: {self.tables_found}",
            f"Columns: {self.columns_found}",
            f"Relationships: {self.relationships_found}",
            f"Indexes: {self.indexes_found}",
        ]
        if self.warnings:
            # The warning count comes first, then one bullet per warning.
            report.append(f"Warnings: {len(self.warnings)}")
            report.extend(f" - {note}" for note in self.warnings)
        return "\n".join(report)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _to_pascal(name: str) -> str:
    """Convert an identifier to PascalCase.

    Double quotes are dropped, the name is split on every run of
    non-alphanumeric characters, and the first character of each piece is
    upper-cased while the rest of the piece is left as-is.
    """
    unquoted = name.replace('"', "")
    pieces = []
    for word in re.split(r"[^A-Za-z0-9]+", unquoted):
        if not word:
            # Leading/trailing separators produce empty fragments.
            continue
        pieces.append(word[0].upper() + word[1:])
    return "".join(pieces)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _to_model_name(text: str) -> str:
    """Turn arbitrary text into a lowercase, underscore-separated model name.

    Falls back to ``"imported_model"`` when nothing alphanumeric remains.
    """
    slug = re.sub(r"[^a-zA-Z0-9]+", "_", text)
    slug = slug.strip("_").lower()
    if not slug:
        return "imported_model"
    return slug
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _default_model(model_name: str, domain: str, owners: List[str]) -> Dict[str, Any]:
    """Build an empty model document in ``draft`` state.

    The header carries the sanitized model name, a fixed ``1.0.0`` version,
    and the given domain and owners; every content section starts out empty.
    """
    header: Dict[str, Any] = {
        "name": _to_model_name(model_name),
        "version": "1.0.0",
        "domain": domain,
        "owners": owners,
        "state": "draft",
    }
    skeleton: Dict[str, Any] = {"model": header}
    for section in ("entities", "relationships", "indexes"):
        skeleton[section] = []
    skeleton["governance"] = {"classification": {}, "stewards": {}}
    skeleton["rules"] = []
    return skeleton
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
# Relationship & PK inference for databases without constraints
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
|
|
98
|
+
# Column-name shapes that commonly denote a primary key; every pattern is
# matched case-insensitively.
_PK_PATTERNS = [
    re.compile(expr, re.IGNORECASE)
    for expr in (
        r"^id$",
        r"^pk$",
        r"^(.+)_id$",  # only if matches table name
        r"^(.+)_pk$",
    )
]
|
|
105
|
+
|
|
106
|
+
# Column-name shapes that commonly denote a foreign key:
# <table>_id, <table>_fk, fk_<table> (case-insensitive) and <table>Id.
# Group 1 of each pattern captures the referenced table name.
_FK_PATTERNS = [
    re.compile(expr, flags)
    for expr, flags in (
        (r"^(.+)_id$", re.IGNORECASE),
        (r"^(.+)_fk$", re.IGNORECASE),
        (r"^fk_(.+)$", re.IGNORECASE),
        (r"^(.+)Id$", 0),  # camelCase (orderId, userId): case-sensitive on purpose
    )
]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _normalize(name: str) -> str:
    """Normalize a name for fuzzy matching.

    Underscores, hyphens and whitespace are removed and the result is
    lower-cased.
    """
    fragments = re.split(r"[_\-\s]+", name)
    return "".join(fragments).lower()
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _plurals(name: str) -> List[str]:
    """Return plausible singular/plural variants of a normalized name.

    The rules are purely suffix-based and deliberately permissive: a variant
    that is not a real word is harmless because variants are only used as
    extra lookup keys. The input itself is never part of the result, and an
    empty name has no variants.
    """
    if not name:
        # Without this guard the "append s" rule would invent the variant "s".
        return []

    variants: List[str] = []
    if name.endswith("ies"):
        variants.append(name[:-3] + "y")  # "categories" → "category"
    if name.endswith(("ses", "xes", "zes")):
        variants.append(name[:-2])  # "addresses" → "address"
    if name.endswith("s") and not name.endswith("ss"):
        variants.append(name[:-1])  # "orders" → "order"
    if not name.endswith("s"):
        variants.append(name + "s")  # "order" → "orders"
    if name.endswith(("ss", "x", "z", "ch", "sh")):
        variants.append(name + "es")  # "address" → "addresses", "box" → "boxes"
    if name.endswith("y") and not name.endswith("ey"):
        variants.append(name[:-1] + "ies")  # "category" → "categories"

    # Drop accidental duplicates while keeping rule order.
    return list(dict.fromkeys(variants))
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _build_entity_lookup(entities: List[Dict[str, Any]]) -> Dict[str, str]:
    """Map normalized table/entity names to the actual entity name.

    Each entity is registered under its normalized name and under the
    singular/plural variants of that name. An exact normalized name always
    takes the slot (replacing whatever was there), whereas a variant only
    fills a key that is still free.
    """
    by_key: Dict[str, str] = {}
    for record in entities:
        actual = record["name"]
        key = _normalize(actual)
        by_key[key] = actual
        for alias in _plurals(key):
            by_key.setdefault(alias, actual)
    return by_key
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _names_own_key(column: str, own_stems: set) -> bool:
    """Tell whether *column* looks like ``<entity>_id`` / ``<entity>_pk``.

    ``own_stems`` holds the normalized entity name and its singular/plural
    variants; the column prefix is normalized the same way before comparing.
    """
    lowered = column.lower()
    if not lowered.endswith(("_id", "_pk")):
        return False
    # Both suffixes are three characters long.
    return _normalize(lowered[:-3]) in own_stems


def infer_primary_keys(entities: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Infer primary keys for entities that have no PK defined.

    Entities that already have a field flagged ``primary_key``, and entities
    without fields, are left untouched. For the rest the first heuristic
    that finds a column wins (in priority order):
    1. Column named exactly 'id' (case-insensitive)
    2. Column named '<table_name>_id' or '<table_name>_pk', where the table
       name may appear in singular or plural form and with or without inner
       underscores (e.g., order_id in Orders, line_item_id in LineItems)
    3. First column whose name ends with '_id' or '_pk'

    The chosen field dict is updated in place: ``primary_key`` is set to
    True and ``nullable`` to False.

    Returns (the same entities list, list of inference messages).
    """
    messages: List[str] = []

    for entity in entities:
        fields = entity.get("fields", [])
        if not fields or any(f.get("primary_key") for f in fields):
            continue

        ename_norm = _normalize(entity["name"])
        # Comparing against the bare normalized name only would miss the
        # common "order_id in orders" / "line_item_id in LineItems" cases and
        # let priority 3 pick an unrelated foreign-key column instead.
        own_stems = {ename_norm, *_plurals(ename_norm)}

        # Priority 1: column named 'id'
        inferred_pk = next((f for f in fields if f["name"].lower() == "id"), None)

        # Priority 2: column named after the entity itself
        if inferred_pk is None:
            inferred_pk = next(
                (f for f in fields if _names_own_key(f["name"], own_stems)),
                None,
            )

        # Priority 3: first column ending in _id or _pk
        if inferred_pk is None:
            inferred_pk = next(
                (f for f in fields if f["name"].lower().endswith(("_id", "_pk"))),
                None,
            )

        if inferred_pk is not None:
            inferred_pk["primary_key"] = True
            inferred_pk["nullable"] = False
            messages.append(f"Inferred PK: {entity['name']}.{inferred_pk['name']}")

    return entities, messages
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def infer_relationships(
    entities: List[Dict[str, Any]],
    existing_relationships: Optional[List[Dict[str, Any]]] = None,
) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Infer foreign key relationships from column naming conventions.

    Detects patterns like:
    - user_id in orders table → Orders.user_id references Users.id
    - customer_fk in invoices → Invoices.customer_fk references Customers.id
    - fk_product in line_items → LineItems.fk_product references Products.id

    Only creates relationships to entities that actually exist in the model.
    Skips columns already marked ``primary_key`` or ``foreign_key``, columns
    that resolve to their own entity, and (from, to) pairs already present
    in ``existing_relationships``.

    Each column used for an inferred relationship is flagged in place with
    ``foreign_key = True``. When the referenced entity has no primary key
    field, the relationship points at a column named 'id'.

    Returns (list of inferred relationships, list of inference messages).
    """
    existing_pairs = {
        (rel.get("from", ""), rel.get("to", ""))
        for rel in (existing_relationships or [])
    }

    entity_lookup = _build_entity_lookup(entities)

    # Build a map of entity_name → its PK field name (default 'id').
    pk_map: Dict[str, str] = {}
    for entity in entities:
        pk_name = next(
            (f["name"] for f in entity.get("fields", []) if f.get("primary_key")),
            None,
        )
        if pk_name is not None:
            pk_map[entity["name"]] = pk_name
        else:
            pk_map.setdefault(entity["name"], "id")

    inferred: List[Dict[str, Any]] = []
    messages: List[str] = []

    for entity in entities:
        entity_name = entity["name"]
        own_norm = _normalize(entity_name)

        for f in entity.get("fields", []):
            if f.get("primary_key") or f.get("foreign_key"):
                continue

            fname = f["name"]

            # The first pattern that matches decides the referenced name.
            ref_table_norm = None
            for pattern in _FK_PATTERNS:
                m = pattern.match(fname)
                if m:
                    ref_table_norm = _normalize(m.group(1))
                    break
            if not ref_table_norm:
                continue

            # Don't self-reference via the entity's own name_id pattern
            if ref_table_norm == own_norm:
                continue

            ref_entity = entity_lookup.get(ref_table_norm)
            if not ref_entity:
                continue

            # The lookup also resolves singular/plural forms, so "order_id"
            # inside "Orders" lands here; that is the entity's own key
            # column, not a reference to another table.
            if ref_entity == entity_name:
                continue

            ref_pk = pk_map.get(ref_entity, "id")
            from_key = f"{ref_entity}.{ref_pk}"
            to_key = f"{entity_name}.{fname}"

            if (from_key, to_key) in existing_pairs:
                continue

            f["foreign_key"] = True
            rel_name = f"{_normalize(ref_entity)}_{own_norm}_{fname}_inferred"
            inferred.append({
                "name": rel_name,
                "from": from_key,
                "to": to_key,
                "cardinality": "one_to_many",
                "inferred": True,
            })
            existing_pairs.add((from_key, to_key))
            messages.append(f"Inferred FK: {entity_name}.{fname} → {ref_entity}.{ref_pk}")

    return inferred, messages
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
class BaseConnector(ABC):
|
|
300
|
+
"""Abstract base class for all database connectors."""
|
|
301
|
+
|
|
302
|
+
connector_type: str = ""
|
|
303
|
+
display_name: str = ""
|
|
304
|
+
required_package: str = ""
|
|
305
|
+
|
|
306
|
+
@abstractmethod
|
|
307
|
+
def test_connection(self, config: ConnectorConfig) -> Tuple[bool, str]:
|
|
308
|
+
"""Test if the connection can be established.
|
|
309
|
+
|
|
310
|
+
Returns (success, message).
|
|
311
|
+
"""
|
|
312
|
+
|
|
313
|
+
@abstractmethod
|
|
314
|
+
def pull_schema(self, config: ConnectorConfig) -> ConnectorResult:
|
|
315
|
+
"""Pull schema from the database and return a ConnectorResult."""
|
|
316
|
+
|
|
317
|
+
def list_schemas(self, config: ConnectorConfig) -> List[Dict[str, Any]]:
|
|
318
|
+
"""List available schemas/datasets in the database.
|
|
319
|
+
|
|
320
|
+
Returns a list of dicts with at least: {"name": str, "table_count": int}.
|
|
321
|
+
Override in subclasses.
|
|
322
|
+
"""
|
|
323
|
+
return []
|
|
324
|
+
|
|
325
|
+
def list_tables(self, config: ConnectorConfig) -> List[Dict[str, Any]]:
|
|
326
|
+
"""List tables in the configured schema.
|
|
327
|
+
|
|
328
|
+
Returns a list of dicts with at least:
|
|
329
|
+
{"name": str, "type": str, "row_count": int|None, "column_count": int}.
|
|
330
|
+
Override in subclasses.
|
|
331
|
+
"""
|
|
332
|
+
return []
|
|
333
|
+
|
|
334
|
+
def check_driver(self) -> Tuple[bool, str]:
|
|
335
|
+
"""Check if the required Python driver package is installed."""
|
|
336
|
+
if not self.required_package:
|
|
337
|
+
return True, "No driver required"
|
|
338
|
+
try:
|
|
339
|
+
__import__(self.required_package)
|
|
340
|
+
return True, f"{self.required_package} is installed"
|
|
341
|
+
except ImportError:
|
|
342
|
+
return False, f"Missing driver: pip install {self.required_package}"
|
|
343
|
+
|
|
344
|
+
def _build_model(self, config: ConnectorConfig) -> Dict[str, Any]:
|
|
345
|
+
return _default_model(
|
|
346
|
+
model_name=config.model_name,
|
|
347
|
+
domain=config.domain,
|
|
348
|
+
owners=config.effective_owners(),
|
|
349
|
+
)
|
|
350
|
+
|
|
351
|
+
def _entity_name(self, table_name: str) -> str:
|
|
352
|
+
return _to_pascal(table_name)
|
|
353
|
+
|
|
354
|
+
def _should_include_table(self, table_name: str, config: ConnectorConfig) -> bool:
|
|
355
|
+
if config.tables and table_name not in config.tables:
|
|
356
|
+
return False
|
|
357
|
+
if config.exclude_tables and table_name in config.exclude_tables:
|
|
358
|
+
return False
|
|
359
|
+
return True
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
# ---------------------------------------------------------------------------
|
|
363
|
+
# Connector registry
|
|
364
|
+
# ---------------------------------------------------------------------------
|
|
365
|
+
|
|
366
|
+
# Connector instances keyed by their ``connector_type``.
_REGISTRY: Dict[str, BaseConnector] = {}


def _register(connector: BaseConnector) -> None:
    """Store *connector* under its ``connector_type``, replacing any previous entry."""
    key = connector.connector_type
    _REGISTRY[key] = connector
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def get_connector(connector_type: str) -> Optional[BaseConnector]:
    """Look up a registered connector by type name; None if it is unknown."""
    try:
        return _REGISTRY[connector_type]
    except KeyError:
        return None
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def list_connectors() -> List[Dict[str, Any]]:
    """List all registered connectors, sorted by type name.

    Each entry has the string keys ``type``, ``name``, ``driver`` and
    ``status`` plus the boolean ``installed``. ``driver`` is "none" when the
    connector declares no required package. Driver availability is checked
    for every connector on each call.
    """
    result: List[Dict[str, Any]] = []
    for name in sorted(_REGISTRY):
        conn = _REGISTRY[name]
        ok, msg = conn.check_driver()
        result.append({
            "type": name,
            "name": conn.display_name,
            "driver": conn.required_package or "none",
            "installed": ok,
            "status": msg,
        })
    return result
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def register_all() -> None:
    """Instantiate and register every built-in connector.

    The connector modules are imported inside the function rather than at
    the top of the file; they import from this module themselves.
    """
    from datalex_core.connectors.postgres import PostgresConnector
    from datalex_core.connectors.mysql import MySQLConnector
    from datalex_core.connectors.snowflake import SnowflakeConnector
    from datalex_core.connectors.bigquery import BigQueryConnector
    from datalex_core.connectors.databricks import DatabricksConnector
    from datalex_core.connectors.sqlserver import (
        SQLServerConnector,
        AzureSQLConnector,
        AzureFabricConnector,
    )
    from datalex_core.connectors.redshift import RedshiftConnector

    builtin_connectors = (
        PostgresConnector,
        MySQLConnector,
        SnowflakeConnector,
        BigQueryConnector,
        DatabricksConnector,
        SQLServerConnector,
        AzureSQLConnector,
        AzureFabricConnector,
        RedshiftConnector,
    )
    for connector_cls in builtin_connectors:
        _register(connector_cls())


# Populate the registry as soon as this module is imported.
register_all()
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""BigQuery connector — pulls schema from INFORMATION_SCHEMA."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import date
|
|
6
|
+
from typing import Any, Dict, List, Tuple
|
|
7
|
+
|
|
8
|
+
from datalex_core.connectors.base import BaseConnector, ConnectorConfig, ConnectorResult, infer_primary_keys, infer_relationships
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# BigQuery type name (upper case, without parameters) → DataLex field type.
# Names that are absent are expected to be handled by the caller's fallback.
_BQ_TYPE_MAP = {
    "STRING": "string",
    "BYTES": "binary",
    # INT64 and all of its documented aliases are 64-bit integers.
    "INT64": "bigint",
    "INTEGER": "bigint",
    "INT": "bigint",
    "SMALLINT": "bigint",
    "BIGINT": "bigint",
    "TINYINT": "bigint",
    "BYTEINT": "bigint",
    "FLOAT64": "float",
    "FLOAT": "float",
    "NUMERIC": "decimal",
    "DECIMAL": "decimal",  # alias of NUMERIC
    "BIGNUMERIC": "decimal",
    "BIGDECIMAL": "decimal",  # alias of BIGNUMERIC
    "BOOLEAN": "boolean",
    "BOOL": "boolean",
    "TIMESTAMP": "timestamp",
    "DATE": "date",
    "TIME": "time",
    "DATETIME": "timestamp",
    "GEOGRAPHY": "string",
    "RECORD": "json",
    "STRUCT": "json",
    "ARRAY": "json",
    "JSON": "json",
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class BigQueryConnector(BaseConnector):
    """Connector that reads table, column and key metadata from BigQuery.

    Metadata comes from the dataset's INFORMATION_SCHEMA views. When the
    dataset declares no primary or foreign key constraints, keys and
    relationships are inferred from column naming conventions instead.
    """

    connector_type = "bigquery"
    display_name = "Google BigQuery"
    required_package = "google.cloud.bigquery"

    @staticmethod
    def _client(config: ConnectorConfig) -> Any:
        """Create a BigQuery client for the configured project.

        An empty ``config.project`` is passed as ``None`` instead of an empty
        string, leaving project resolution to the client library.
        """
        from google.cloud import bigquery

        return bigquery.Client(project=config.project or None)

    @staticmethod
    def _map_type(data_type: Any) -> str:
        """Translate a BigQuery column type into a DataLex field type.

        Type parameters and element types are ignored, so ``ARRAY<STRING>``,
        ``STRUCT<a INT64>`` and ``NUMERIC(10, 2)`` are looked up as ``ARRAY``,
        ``STRUCT`` and ``NUMERIC``. Missing and unknown types map to
        ``"string"``.
        """
        raw = data_type or "STRING"
        base = raw.split("<", 1)[0].split("(", 1)[0].strip().upper()
        return _BQ_TYPE_MAP.get(base, "string")

    def test_connection(self, config: ConnectorConfig) -> Tuple[bool, str]:
        """Check that datasets can be listed with the current credentials.

        Returns (success, message); never raises.
        """
        try:
            client = self._client(config)
            # Consume the result so the listing is actually performed.
            list(client.list_datasets(max_results=1))
            return True, "Connection successful"
        except ImportError:
            return False, "google-cloud-bigquery not installed. Run: pip install google-cloud-bigquery"
        except Exception as e:
            return False, f"Connection failed: {e}"

    def list_schemas(self, config: ConnectorConfig) -> List[Dict[str, Any]]:
        """List the project's datasets with their table counts, sorted by name."""
        client = self._client(config)
        results = []
        for ds in client.list_datasets():
            try:
                count = len(list(client.list_tables(ds.reference)))
            except Exception:
                # Best effort: a dataset whose tables cannot be listed is
                # still reported, with a count of zero.
                count = 0
            results.append({"name": ds.dataset_id, "table_count": count})
        return sorted(results, key=lambda x: x["name"])

    def list_tables(self, config: ConnectorConfig) -> List[Dict[str, Any]]:
        """List tables and views of ``config.dataset``, sorted by name.

        Returns an empty list when no dataset is configured. Column and row
        counts are fetched per table; if that lookup fails the entry reports
        0 columns and an unknown (None) row count.
        """
        client = self._client(config)
        dataset = config.dataset
        if not dataset:
            return []
        # Fall back to the client's project when none was configured.
        project = config.project or client.project
        results = []
        for tbl in client.list_tables(f"{project}.{dataset}"):
            # Substring test so materialized views count as views, matching
            # the classification used when pulling the schema.
            ttype = "view" if "VIEW" in (tbl.table_type or "") else "table"
            try:
                full = client.get_table(tbl.reference)
                col_count = len(full.schema)
                row_count = full.num_rows
            except Exception:
                col_count = 0
                row_count = None
            results.append({"name": tbl.table_id, "type": ttype, "column_count": col_count, "row_count": row_count})
        return sorted(results, key=lambda x: x["name"])

    def pull_schema(self, config: ConnectorConfig) -> ConnectorResult:
        """Pull the schema of ``config.dataset`` and return a ConnectorResult."""
        client = self._client(config)
        return self._pull(client, config)

    def _pull(self, client: Any, config: ConnectorConfig) -> ConnectorResult:
        """Build the model from the dataset's INFORMATION_SCHEMA views.

        Failures while reading table or column metadata propagate to the
        caller; failures while reading key constraints are downgraded to
        warnings on the result.
        """
        model = self._build_model(config)
        dataset = config.dataset
        warnings: List[str] = []

        if not dataset:
            warnings.append("No dataset specified. Use --dataset to filter.")
            return ConnectorResult(model=model, warnings=warnings)

        # Fall back to the client's project when none was configured.
        project = config.project or getattr(client, "project", "")
        info_schema = f"{project}.{dataset}.INFORMATION_SCHEMA"
        pulled_on = date.today().isoformat()

        # --- Tables ---
        query = f"""
            SELECT table_name, table_type
            FROM `{info_schema}.TABLES`
            ORDER BY table_name
        """
        table_entities: Dict[str, Dict[str, Any]] = {}
        for row in client.query(query).result():
            table_name = row.table_name
            if not self._should_include_table(table_name, config):
                continue
            entity_type = "view" if "VIEW" in (row.table_type or "") else "table"
            table_entities[table_name] = {
                "name": self._entity_name(table_name),
                "physical_name": table_name,
                "type": entity_type,
                "description": f"Pulled from BigQuery {project}.{dataset}.{table_name} on {pulled_on}",
                "fields": [],
                "schema": dataset,
                "database": project,
            }

        # --- Columns ---
        query = f"""
            SELECT table_name, column_name, data_type, is_nullable
            FROM `{info_schema}.COLUMNS`
            ORDER BY table_name, ordinal_position
        """
        total_columns = 0
        for row in client.query(query).result():
            entity = table_entities.get(row.table_name)
            if entity is None:
                continue
            entity["fields"].append({
                "name": row.column_name,
                "type": self._map_type(row.data_type),
                "nullable": row.is_nullable == "YES",
            })
            total_columns += 1

        # --- Primary keys (BigQuery table constraints) ---
        try:
            # Select by constraint type instead of guessing from the
            # constraint name, so foreign keys whose names happen to contain
            # "pk" are not mistaken for primary keys.
            pk_query = f"""
                SELECT kcu.table_name AS table_name, kcu.column_name AS column_name
                FROM `{info_schema}.TABLE_CONSTRAINTS` tc
                JOIN `{info_schema}.KEY_COLUMN_USAGE` kcu
                    ON tc.constraint_name = kcu.constraint_name
                    AND tc.table_name = kcu.table_name
                WHERE tc.constraint_type = 'PRIMARY KEY'
            """
            for row in client.query(pk_query).result():
                entity = table_entities.get(row.table_name)
                if entity is None:
                    continue
                for f in entity["fields"]:
                    if f["name"] == row.column_name:
                        f["primary_key"] = True
                        f["nullable"] = False
        except Exception as e:
            warnings.append(f"Could not fetch primary keys: {e}")

        # --- Foreign keys ---
        relationships: List[Dict[str, Any]] = []
        skipped_constraints: List[str] = []
        try:
            fk_query = f"""
                SELECT
                    tc.table_name AS child_table,
                    kcu.column_name AS child_column,
                    ccu.table_name AS parent_table,
                    ccu.column_name AS parent_column,
                    tc.constraint_name
                FROM `{info_schema}.TABLE_CONSTRAINTS` tc
                JOIN `{info_schema}.KEY_COLUMN_USAGE` kcu
                    ON tc.constraint_name = kcu.constraint_name
                JOIN `{info_schema}.CONSTRAINT_COLUMN_USAGE` ccu
                    ON tc.constraint_name = ccu.constraint_name
                WHERE tc.constraint_type = 'FOREIGN KEY'
            """
            for row in client.query(fk_query).result():
                parent_entity = self._entity_name(row.parent_table)
                child_entity = self._entity_name(row.child_table)
                child = table_entities.get(row.child_table)
                if child is not None:
                    for f in child["fields"]:
                        if f["name"] == row.child_column:
                            f["foreign_key"] = True
                # A relationship is only meaningful when both ends are part
                # of the model; otherwise it would point at a missing entity.
                if child is None or row.parent_table not in table_entities:
                    label = row.constraint_name or f"{row.child_table}.{row.child_column}"
                    if label not in skipped_constraints:
                        skipped_constraints.append(label)
                    continue
                relationships.append({
                    "name": row.constraint_name or f"{parent_entity.lower()}_{child_entity.lower()}_{row.child_column}_fk",
                    "from": f"{parent_entity}.{row.parent_column}",
                    "to": f"{child_entity}.{row.child_column}",
                    "cardinality": "one_to_many",
                })
        except Exception as e:
            warnings.append(f"Could not fetch foreign keys: {e}")

        if skipped_constraints:
            warnings.append(
                f"Skipped {len(skipped_constraints)} foreign key constraint(s) "
                "that involve tables outside the imported set."
            )

        entities_list = list(table_entities.values())

        # --- Inference: fill in PKs and FKs when constraints are missing ---
        has_any_pk = any(
            f.get("primary_key") for ent in entities_list for f in ent.get("fields", [])
        )
        if not has_any_pk:
            entities_list, pk_msgs = infer_primary_keys(entities_list)
            warnings.extend(pk_msgs)

        if not relationships:
            inferred_rels, fk_msgs = infer_relationships(entities_list, relationships)
            relationships.extend(inferred_rels)
            warnings.extend(fk_msgs)
            if inferred_rels:
                warnings.insert(0, f"No FK constraints found — inferred {len(inferred_rels)} relationships from column naming patterns.")

        model["entities"] = entities_list
        model["relationships"] = relationships

        return ConnectorResult(
            model=model,
            tables_found=len(table_entities),
            columns_found=total_columns,
            relationships_found=len(relationships),
            indexes_found=0,
            warnings=warnings,
        )
|