datalex-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. datalex_cli/__init__.py +1 -0
  2. datalex_cli/datalex_cli.py +658 -0
  3. datalex_cli/main.py +2925 -0
  4. datalex_cli-0.1.1.dist-info/METADATA +228 -0
  5. datalex_cli-0.1.1.dist-info/RECORD +64 -0
  6. datalex_cli-0.1.1.dist-info/WHEEL +5 -0
  7. datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
  8. datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
  9. datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
  10. datalex_core/__init__.py +94 -0
  11. datalex_core/_schemas/datalex/common.schema.json +127 -0
  12. datalex_core/_schemas/datalex/domain.schema.json +24 -0
  13. datalex_core/_schemas/datalex/entity.schema.json +158 -0
  14. datalex_core/_schemas/datalex/model.schema.json +141 -0
  15. datalex_core/_schemas/datalex/policy.schema.json +70 -0
  16. datalex_core/_schemas/datalex/project.schema.json +82 -0
  17. datalex_core/_schemas/datalex/snippet.schema.json +24 -0
  18. datalex_core/_schemas/datalex/source.schema.json +104 -0
  19. datalex_core/_schemas/datalex/term.schema.json +30 -0
  20. datalex_core/canonical.py +166 -0
  21. datalex_core/completion.py +204 -0
  22. datalex_core/connectors/__init__.py +39 -0
  23. datalex_core/connectors/base.py +417 -0
  24. datalex_core/connectors/bigquery.py +229 -0
  25. datalex_core/connectors/databricks.py +262 -0
  26. datalex_core/connectors/mysql.py +266 -0
  27. datalex_core/connectors/postgres.py +309 -0
  28. datalex_core/connectors/redshift.py +298 -0
  29. datalex_core/connectors/snowflake.py +336 -0
  30. datalex_core/connectors/sqlserver.py +425 -0
  31. datalex_core/datalex/__init__.py +26 -0
  32. datalex_core/datalex/diff.py +188 -0
  33. datalex_core/datalex/errors.py +85 -0
  34. datalex_core/datalex/loader.py +512 -0
  35. datalex_core/datalex/migrate_layout.py +382 -0
  36. datalex_core/datalex/parse_cache.py +102 -0
  37. datalex_core/datalex/project.py +214 -0
  38. datalex_core/datalex/types.py +224 -0
  39. datalex_core/dbt/__init__.py +18 -0
  40. datalex_core/dbt/emit.py +344 -0
  41. datalex_core/dbt/manifest.py +329 -0
  42. datalex_core/dbt/profiles.py +185 -0
  43. datalex_core/dbt/sync.py +279 -0
  44. datalex_core/dbt/warehouse.py +215 -0
  45. datalex_core/dialects/__init__.py +15 -0
  46. datalex_core/dialects/_common.py +48 -0
  47. datalex_core/dialects/base.py +47 -0
  48. datalex_core/dialects/postgres.py +164 -0
  49. datalex_core/dialects/registry.py +36 -0
  50. datalex_core/dialects/snowflake.py +129 -0
  51. datalex_core/diffing.py +358 -0
  52. datalex_core/docs_generator.py +797 -0
  53. datalex_core/doctor.py +181 -0
  54. datalex_core/generators.py +478 -0
  55. datalex_core/importers.py +1176 -0
  56. datalex_core/issues.py +23 -0
  57. datalex_core/loader.py +21 -0
  58. datalex_core/migrate.py +316 -0
  59. datalex_core/modeling.py +679 -0
  60. datalex_core/packages.py +430 -0
  61. datalex_core/policy.py +1037 -0
  62. datalex_core/resolver.py +456 -0
  63. datalex_core/schema.py +54 -0
  64. datalex_core/semantic.py +1561 -0
@@ -0,0 +1,417 @@
1
+ """Base connector interface and registry for database connectors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from abc import ABC, abstractmethod
7
+ from dataclasses import dataclass, field
8
+ from datetime import date
9
+ from typing import Any, Dict, List, Optional, Tuple
10
+
11
+
12
@dataclass
class ConnectorConfig:
    """Settings bag handed to a connector for one connection/import run.

    Only ``connector_type`` is required. Every other field has an empty
    default, so each connector reads just the fields it needs.
    """

    connector_type: str

    # Connection coordinates and credentials.
    host: str = ""
    port: int = 0
    database: str = ""
    schema: str = ""
    user: str = ""
    password: str = ""
    warehouse: str = ""
    project: str = ""
    dataset: str = ""
    catalog: str = ""
    token: str = ""
    private_key_path: str = ""
    connection_string: str = ""

    # Table filters: an allow-list and a deny-list; ``None`` means "no filter".
    tables: Optional[List[str]] = None
    exclude_tables: Optional[List[str]] = None

    # Metadata stamped onto the generated model.
    model_name: str = "imported_model"
    domain: str = "imported"
    owners: Optional[List[str]] = None

    # Free-form extension point for connector-specific options.
    extra: Dict[str, Any] = field(default_factory=dict)

    def effective_owners(self) -> List[str]:
        """Return the configured owners, or a placeholder list if none are set."""
        if self.owners:
            return self.owners
        return ["data-team@example.com"]
39
+
40
+
41
@dataclass
class ConnectorResult:
    """Outcome of a schema pull: the model plus counters and warnings."""

    model: Dict[str, Any]
    tables_found: int = 0
    columns_found: int = 0
    relationships_found: int = 0
    indexes_found: int = 0
    warnings: List[str] = field(default_factory=list)

    def summary(self) -> str:
        """Render the counters, then any warnings, as newline-separated text."""
        report = [
            f"Tables: {self.tables_found}",
            f"Columns: {self.columns_found}",
            f"Relationships: {self.relationships_found}",
            f"Indexes: {self.indexes_found}",
        ]
        if self.warnings:
            # One header line with the count, then one bullet per warning.
            report.append(f"Warnings: {len(self.warnings)}")
            report.extend(f" - {w}" for w in self.warnings)
        return "\n".join(report)
64
+
65
+
66
+ def _to_pascal(name: str) -> str:
67
+ name = name.replace('"', "")
68
+ parts = re.split(r"[^A-Za-z0-9]+", name)
69
+ return "".join(part[:1].upper() + part[1:] for part in parts if part)
70
+
71
+
72
+ def _to_model_name(text: str) -> str:
73
+ cleaned = re.sub(r"[^a-zA-Z0-9]+", "_", text).strip("_").lower()
74
+ return cleaned or "imported_model"
75
+
76
+
77
def _default_model(model_name: str, domain: str, owners: List[str]) -> Dict[str, Any]:
    """Return an empty draft model skeleton for a fresh import.

    The model name is slugified with ``_to_model_name``. ``domain`` and
    ``owners`` are stored as given. All collections start empty.
    """
    header = {
        "name": _to_model_name(model_name),
        "version": "1.0.0",
        "domain": domain,
        "owners": owners,
        "state": "draft",
    }
    governance = {"classification": {}, "stewards": {}}
    return {
        "model": header,
        "entities": [],
        "relationships": [],
        "indexes": [],
        "governance": governance,
        "rules": [],
    }
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # Relationship & PK inference for databases without constraints
96
+ # ---------------------------------------------------------------------------
97
+
98
+ # Common PK column patterns (case-insensitive)
99
+ _PK_PATTERNS = [
100
+ re.compile(r"^id$", re.IGNORECASE),
101
+ re.compile(r"^pk$", re.IGNORECASE),
102
+ re.compile(r"^(.+)_id$", re.IGNORECASE), # only if matches table name
103
+ re.compile(r"^(.+)_pk$", re.IGNORECASE),
104
+ ]
105
+
106
+ # Common FK column patterns: <table>_id, <table>_fk, <table>Id, fk_<table>
107
+ _FK_PATTERNS = [
108
+ re.compile(r"^(.+)_id$", re.IGNORECASE),
109
+ re.compile(r"^(.+)_fk$", re.IGNORECASE),
110
+ re.compile(r"^fk_(.+)$", re.IGNORECASE),
111
+ re.compile(r"^(.+)Id$"), # camelCase: orderId, userId
112
+ ]
113
+
114
+
115
+ def _normalize(name: str) -> str:
116
+ """Normalize a name for fuzzy matching: lowercase, strip underscores/hyphens."""
117
+ return re.sub(r"[_\-\s]+", "", name).lower()
118
+
119
+
120
+ def _plurals(name: str) -> List[str]:
121
+ """Return plausible singular/plural variants of a normalized name."""
122
+ variants = []
123
+ if name.endswith("ies"):
124
+ variants.append(name[:-3] + "y") # "categories" → "category"
125
+ if name.endswith("ses") or name.endswith("xes") or name.endswith("zes"):
126
+ variants.append(name[:-2]) # "addresses" → "address"
127
+ if name.endswith("s") and not name.endswith("ss"):
128
+ variants.append(name[:-1]) # "orders" → "order"
129
+ if not name.endswith("s"):
130
+ variants.append(name + "s") # "order" → "orders"
131
+ if name.endswith("y") and not name.endswith("ey"):
132
+ variants.append(name[:-1] + "ies") # "category" → "categories"
133
+ return variants
134
+
135
+
136
+ def _build_entity_lookup(entities: List[Dict[str, Any]]) -> Dict[str, str]:
137
+ """Build a lookup from normalized table/entity name → actual entity name.
138
+
139
+ Includes both the entity name and singular/plural variants for flexible matching.
140
+ """
141
+ lookup: Dict[str, str] = {}
142
+ for entity in entities:
143
+ ename = entity["name"]
144
+ norm = _normalize(ename)
145
+ lookup[norm] = ename
146
+ for variant in _plurals(norm):
147
+ if variant not in lookup:
148
+ lookup[variant] = ename
149
+ return lookup
150
+
151
+
152
def infer_primary_keys(entities: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Infer a primary key for every entity that has none declared.

    Entities that already have a ``primary_key`` field, or that have no
    fields at all, are left untouched. Otherwise the first hit among these
    heuristics wins (in priority order):

    1. A column named exactly ``id`` (case-insensitive).
    2. A column named ``<entity>_id`` or ``<entity>_pk``. The column stem is
       compared after normalization and against singular/plural variants of
       the entity name, so ``order_id`` matches ``Orders`` and
       ``line_item_id`` matches ``LineItems``.
    3. The first column ending in ``_id`` or ``_pk``.

    The chosen field dict is mutated in place: ``primary_key`` is set to
    ``True`` and ``nullable`` to ``False``.

    Returns:
        ``(entities, messages)``: the same list object that was passed in,
        and one ``"Inferred PK: Entity.column"`` message per inferred key.
    """
    messages: List[str] = []
    key_suffixes = ("_id", "_pk")  # both suffixes are three characters long

    for entity in entities:
        fields = entity.get("fields", [])
        if not fields or any(f.get("primary_key") for f in fields):
            continue

        ename_norm = _normalize(entity["name"])
        # Accept both grammatical numbers: a table "orders" usually keys on
        # "order_id", not "orders_id".
        own_names = {ename_norm, *_plurals(ename_norm)}

        # Priority 1: column named 'id'.
        inferred_pk = next((f for f in fields if f["name"].lower() == "id"), None)

        # Priority 2: '<entity>_id' / '<entity>_pk', tolerant of separators
        # and singular/plural differences in the stem.
        if inferred_pk is None:
            for f in fields:
                fname = f["name"].lower()
                if fname.endswith(key_suffixes) and _normalize(fname[:-3]) in own_names:
                    inferred_pk = f
                    break

        # Priority 3: first column ending in _id or _pk.
        if inferred_pk is None:
            inferred_pk = next(
                (f for f in fields if f["name"].lower().endswith(key_suffixes)),
                None,
            )

        if inferred_pk is not None:
            inferred_pk["primary_key"] = True
            inferred_pk["nullable"] = False
            messages.append(f"Inferred PK: {entity['name']}.{inferred_pk['name']}")

    return entities, messages
204
+
205
+
206
def infer_relationships(
    entities: List[Dict[str, Any]],
    existing_relationships: Optional[List[Dict[str, Any]]] = None,
) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Infer foreign key relationships from column naming conventions.

    Detects patterns like:
    - user_id in orders table → Orders.user_id references Users.id
    - customer_fk in invoices → Invoices.customer_fk references Customers.id
    - fk_product in line_items → LineItems.fk_product references Products.id

    Only the first matching entry of ``_FK_PATTERNS`` is considered for a
    column. A relationship is created only when the captured stem resolves
    (via normalized, singular/plural-tolerant lookup) to an entity present
    in *entities*. Columns already flagged ``primary_key`` or
    ``foreign_key`` are skipped, as are columns that resolve back to their
    own entity. Matching field dicts are mutated in place
    (``foreign_key = True``).

    Args:
        entities: Entity dicts with a ``name`` and an optional ``fields`` list.
        existing_relationships: Relationships already known. A pair with the
            same ``from``/``to`` keys is not inferred again.

    Returns:
        ``(inferred relationships, messages)``. The referenced column is the
        target entity's declared primary key, or ``"id"`` when it has none.
    """
    existing_pairs = {
        (rel.get("from", ""), rel.get("to", ""))
        for rel in (existing_relationships or [])
    }

    entity_lookup = _build_entity_lookup(entities)

    # entity name -> name of its first primary-key field ('id' if none is declared).
    pk_map: Dict[str, str] = {}
    for entity in entities:
        declared = next(
            (f["name"] for f in entity.get("fields", []) if f.get("primary_key")),
            None,
        )
        pk_map[entity["name"]] = declared if declared is not None else "id"

    inferred: List[Dict[str, Any]] = []
    messages: List[str] = []

    for entity in entities:
        entity_name = entity["name"]
        own_norm = _normalize(entity_name)

        for f in entity.get("fields", []):
            # Keys that are already classified need no inference.
            if f.get("primary_key") or f.get("foreign_key"):
                continue

            fname = f["name"]
            ref_table_norm = None
            for pattern in _FK_PATTERNS:
                m = pattern.match(fname)
                if m:
                    ref_table_norm = _normalize(m.group(1))
                    break

            if not ref_table_norm:
                continue

            # Don't self-reference via the entity's own name_id pattern.
            if ref_table_norm == own_norm:
                continue

            ref_entity = entity_lookup.get(ref_table_norm)
            if not ref_entity:
                continue

            # The lookup also matches singular/plural variants, so a column
            # such as "order_id" inside "Orders" resolves back to "Orders"
            # even though the raw stems differ; treat that as a self-reference
            # too.
            if ref_entity == entity_name:
                continue

            ref_pk = pk_map.get(ref_entity, "id")
            from_key = f"{ref_entity}.{ref_pk}"
            to_key = f"{entity_name}.{fname}"

            if (from_key, to_key) in existing_pairs:
                continue

            f["foreign_key"] = True
            rel_name = f"{_normalize(ref_entity)}_{own_norm}_{fname}_inferred"
            inferred.append({
                "name": rel_name,
                "from": from_key,
                "to": to_key,
                "cardinality": "one_to_many",
                "inferred": True,
            })
            existing_pairs.add((from_key, to_key))
            messages.append(f"Inferred FK: {entity_name}.{fname} → {ref_entity}.{ref_pk}")

    return inferred, messages
297
+
298
+
299
class BaseConnector(ABC):
    """Common interface and shared helpers for every database connector.

    Subclasses set the three class attributes below and implement
    ``test_connection`` and ``pull_schema``. The listing methods are optional.
    """

    connector_type: str = ""    # registry key
    display_name: str = ""      # human-readable label
    required_package: str = ""  # importable driver module name; "" if none

    @abstractmethod
    def test_connection(self, config: ConnectorConfig) -> Tuple[bool, str]:
        """Try to connect with *config* and return ``(success, message)``."""

    @abstractmethod
    def pull_schema(self, config: ConnectorConfig) -> ConnectorResult:
        """Read the database schema and return it as a ``ConnectorResult``."""

    def list_schemas(self, config: ConnectorConfig) -> List[Dict[str, Any]]:
        """List available schemas/datasets.

        Each item should carry at least ``{"name": str, "table_count": int}``.
        This default returns an empty list; subclasses override it.
        """
        return []

    def list_tables(self, config: ConnectorConfig) -> List[Dict[str, Any]]:
        """List tables in the configured schema.

        Each item should carry at least ``{"name": str, "type": str,
        "row_count": int|None, "column_count": int}``. This default returns
        an empty list; subclasses override it.
        """
        return []

    def check_driver(self) -> Tuple[bool, str]:
        """Report whether ``required_package`` can be imported.

        Returns ``(ok, message)``. A connector with no required package is
        always OK.
        """
        package = self.required_package
        if not package:
            return True, "No driver required"
        try:
            __import__(package)
        except ImportError:
            return False, f"Missing driver: pip install {package}"
        else:
            return True, f"{package} is installed"

    def _build_model(self, config: ConnectorConfig) -> Dict[str, Any]:
        """Create the empty model skeleton from the config's model metadata."""
        return _default_model(
            model_name=config.model_name,
            domain=config.domain,
            owners=config.effective_owners(),
        )

    def _entity_name(self, table_name: str) -> str:
        """Derive the PascalCase entity name for a physical table name."""
        return _to_pascal(table_name)

    def _should_include_table(self, table_name: str, config: ConnectorConfig) -> bool:
        """Apply the config's table filters to *table_name*.

        A non-empty ``tables`` list acts as an allow-list and a non-empty
        ``exclude_tables`` list as a deny-list. Matching is exact.
        """
        allow, deny = config.tables, config.exclude_tables
        if allow and table_name not in allow:
            return False
        return not (deny and table_name in deny)
360
+
361
+
362
+ # ---------------------------------------------------------------------------
363
+ # Connector registry
364
+ # ---------------------------------------------------------------------------
365
+
366
+ _REGISTRY: Dict[str, BaseConnector] = {}
367
+
368
+
369
+ def _register(connector: BaseConnector) -> None:
370
+ _REGISTRY[connector.connector_type] = connector
371
+
372
+
373
+ def get_connector(connector_type: str) -> Optional[BaseConnector]:
374
+ """Get a connector by type name."""
375
+ return _REGISTRY.get(connector_type)
376
+
377
+
378
+ def list_connectors() -> List[Dict[str, str]]:
379
+ """List all registered connectors."""
380
+ result = []
381
+ for name, conn in sorted(_REGISTRY.items()):
382
+ ok, msg = conn.check_driver()
383
+ result.append({
384
+ "type": name,
385
+ "name": conn.display_name,
386
+ "driver": conn.required_package or "none",
387
+ "installed": ok,
388
+ "status": msg,
389
+ })
390
+ return result
391
+
392
+
393
def register_all() -> None:
    """Instantiate and register every built-in connector.

    The connector modules are imported inside the function rather than at
    the top of the file. NOTE(review): presumably this avoids a circular
    import, since connector modules import names from this module — confirm.
    """
    from datalex_core.connectors.postgres import PostgresConnector
    from datalex_core.connectors.mysql import MySQLConnector
    from datalex_core.connectors.snowflake import SnowflakeConnector
    from datalex_core.connectors.bigquery import BigQueryConnector
    from datalex_core.connectors.databricks import DatabricksConnector
    from datalex_core.connectors.sqlserver import SQLServerConnector, AzureSQLConnector, AzureFabricConnector
    from datalex_core.connectors.redshift import RedshiftConnector

    builtin_connectors = (
        PostgresConnector,
        MySQLConnector,
        SnowflakeConnector,
        BigQueryConnector,
        DatabricksConnector,
        SQLServerConnector,
        AzureSQLConnector,
        AzureFabricConnector,
        RedshiftConnector,
    )
    for connector_cls in builtin_connectors:
        _register(connector_cls())


# Populate the registry as a side effect of importing this module.
register_all()
@@ -0,0 +1,229 @@
1
+ """BigQuery connector — pulls schema from INFORMATION_SCHEMA."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import date
6
+ from typing import Any, Dict, List, Tuple
7
+
8
+ from datalex_core.connectors.base import BaseConnector, ConnectorConfig, ConnectorResult, infer_primary_keys, infer_relationships
9
+
10
+
11
+ _BQ_TYPE_MAP = {
12
+ "STRING": "string",
13
+ "BYTES": "binary",
14
+ "INT64": "bigint",
15
+ "INTEGER": "integer",
16
+ "FLOAT64": "float",
17
+ "FLOAT": "float",
18
+ "NUMERIC": "decimal",
19
+ "BIGNUMERIC": "decimal",
20
+ "BOOLEAN": "boolean",
21
+ "BOOL": "boolean",
22
+ "TIMESTAMP": "timestamp",
23
+ "DATE": "date",
24
+ "TIME": "time",
25
+ "DATETIME": "timestamp",
26
+ "GEOGRAPHY": "string",
27
+ "RECORD": "json",
28
+ "STRUCT": "json",
29
+ "ARRAY": "json",
30
+ "JSON": "json",
31
+ }
32
+
33
+
34
class BigQueryConnector(BaseConnector):
    """Connector that builds a model from a BigQuery dataset's INFORMATION_SCHEMA.

    The ``google.cloud.bigquery`` client is imported lazily inside each
    method, so this module can be imported without the driver installed.
    The client is created with ``config.project`` only.
    """

    connector_type = "bigquery"
    display_name = "Google BigQuery"
    required_package = "google.cloud.bigquery"

    def test_connection(self, config: ConnectorConfig) -> Tuple[bool, str]:
        """Check that a client can be created and one dataset page can be listed.

        Returns ``(success, message)`` and never raises: a missing driver and
        any other failure are both reported through the message.
        """
        try:
            from google.cloud import bigquery
            client = bigquery.Client(project=config.project)
            # Materializing the iterator forces a real API call. The result
            # itself is not needed.
            list(client.list_datasets(max_results=1))
            return True, "Connection successful"
        except ImportError:
            return False, "google-cloud-bigquery not installed. Run: pip install google-cloud-bigquery"
        except Exception as e:
            return False, f"Connection failed: {e}"

    def list_schemas(self, config: ConnectorConfig) -> List[Dict[str, Any]]:
        """Return ``{"name", "table_count"}`` for every dataset, sorted by name.

        A dataset whose tables cannot be listed is reported with a count of 0.
        """
        from google.cloud import bigquery
        client = bigquery.Client(project=config.project)
        results = []
        for ds in client.list_datasets():
            try:
                count = len(list(client.list_tables(ds.reference)))
            except Exception:
                # Best effort: still list the dataset when its tables are unreadable.
                count = 0
            results.append({"name": ds.dataset_id, "table_count": count})
        return sorted(results, key=lambda x: x["name"])

    def list_tables(self, config: ConnectorConfig) -> List[Dict[str, Any]]:
        """Return name/type/column_count/row_count for each table in ``config.dataset``.

        Returns an empty list when no dataset is configured. A table whose
        details cannot be fetched gets ``column_count=0`` and
        ``row_count=None``.
        """
        from google.cloud import bigquery
        client = bigquery.Client(project=config.project)
        dataset = config.dataset
        if not dataset:
            return []
        results = []
        for tbl in client.list_tables(f"{config.project}.{dataset}"):
            ttype = "view" if tbl.table_type == "VIEW" else "table"
            try:
                # The listing item carries no schema, so fetch the full table.
                full = client.get_table(tbl.reference)
                col_count = len(full.schema)
                row_count = full.num_rows
            except Exception:
                col_count = 0
                row_count = None
            results.append({"name": tbl.table_id, "type": ttype, "column_count": col_count, "row_count": row_count})
        return sorted(results, key=lambda x: x["name"])

    def pull_schema(self, config: ConnectorConfig) -> ConnectorResult:
        """Create a BigQuery client for ``config.project`` and pull the schema."""
        from google.cloud import bigquery

        client = bigquery.Client(project=config.project)
        return self._pull(client, config)

    @staticmethod
    def _map_type(data_type: Any) -> str:
        """Translate a BigQuery ``data_type`` string to a model field type.

        INFORMATION_SCHEMA reports parameterized and nested types in full
        (e.g. ``NUMERIC(10, 2)``, ``ARRAY<STRING>``, ``STRUCT<a INT64>``), so
        only the leading base type name is looked up. Missing or unknown
        types map to ``"string"``.
        """
        base = (data_type or "STRING").upper()
        for opener in ("<", "("):
            base = base.partition(opener)[0]
        return _BQ_TYPE_MAP.get(base.strip(), "string")

    def _pull(self, client: Any, config: ConnectorConfig) -> ConnectorResult:
        """Read tables, columns and key constraints, then assemble the model.

        When no primary key (or no foreign key) constraint is found anywhere,
        the naming-convention heuristics from the base module fill the gap
        and their messages are added to the result's warnings.

        NOTE(review): project and dataset names are interpolated into the
        SQL text because identifiers cannot be bound as query parameters;
        they are expected to come from trusted configuration.
        """
        model = self._build_model(config)
        project = config.project
        dataset = config.dataset
        warnings: List[str] = []

        if not dataset:
            warnings.append("No dataset specified. Use --dataset to filter.")
            return ConnectorResult(model=model, warnings=warnings)

        info_schema = f"{project}.{dataset}.INFORMATION_SCHEMA"

        # --- Tables ---
        query = f"""
            SELECT table_name, table_type
            FROM `{info_schema}.TABLES`
            ORDER BY table_name
        """
        rows = client.query(query).result()

        table_entities: Dict[str, Dict[str, Any]] = {}
        for row in rows:
            table_name = row.table_name
            table_type = row.table_type
            if not self._should_include_table(table_name, config):
                continue
            entity_name = self._entity_name(table_name)
            entity_type = "view" if "VIEW" in table_type else "table"
            table_entities[table_name] = {
                "name": entity_name,
                "physical_name": table_name,
                "type": entity_type,
                "description": f"Pulled from BigQuery {project}.{dataset}.{table_name} on {date.today().isoformat()}",
                "fields": [],
                "schema": dataset,
                "database": project,
            }

        # --- Columns ---
        query = f"""
            SELECT table_name, column_name, data_type, is_nullable
            FROM `{info_schema}.COLUMNS`
            ORDER BY table_name, ordinal_position
        """
        col_rows = client.query(query).result()
        total_columns = 0

        for row in col_rows:
            tname = row.table_name
            if tname not in table_entities:
                continue

            field: Dict[str, Any] = {
                "name": row.column_name,
                "type": self._map_type(row.data_type),
                "nullable": row.is_nullable == "YES",
            }
            table_entities[tname]["fields"].append(field)
            total_columns += 1

        # --- Primary keys (BigQuery table constraints) ---
        # Select by constraint type, not by guessing from the constraint
        # name, so foreign-key rows in KEY_COLUMN_USAGE are never mistaken
        # for primary keys.
        try:
            pk_query = f"""
                SELECT kcu.table_name, kcu.column_name
                FROM `{info_schema}.TABLE_CONSTRAINTS` tc
                JOIN `{info_schema}.KEY_COLUMN_USAGE` kcu
                    ON tc.constraint_name = kcu.constraint_name
                    AND tc.table_name = kcu.table_name
                WHERE tc.constraint_type = 'PRIMARY KEY'
            """
            pk_rows = client.query(pk_query).result()
            for row in pk_rows:
                if row.table_name in table_entities:
                    for f in table_entities[row.table_name]["fields"]:
                        if f["name"] == row.column_name:
                            f["primary_key"] = True
                            f["nullable"] = False
        except Exception as e:
            warnings.append(f"Could not fetch primary keys: {e}")

        # --- Foreign keys ---
        relationships: List[Dict[str, Any]] = []
        try:
            fk_query = f"""
                SELECT
                    tc.table_name AS child_table,
                    kcu.column_name AS child_column,
                    ccu.table_name AS parent_table,
                    ccu.column_name AS parent_column,
                    tc.constraint_name
                FROM `{info_schema}.TABLE_CONSTRAINTS` tc
                JOIN `{info_schema}.KEY_COLUMN_USAGE` kcu
                    ON tc.constraint_name = kcu.constraint_name
                JOIN `{info_schema}.CONSTRAINT_COLUMN_USAGE` ccu
                    ON tc.constraint_name = ccu.constraint_name
                WHERE tc.constraint_type = 'FOREIGN KEY'
            """
            fk_rows = client.query(fk_query).result()
            for row in fk_rows:
                parent_entity = self._entity_name(row.parent_table)
                child_entity = self._entity_name(row.child_table)
                if row.child_table in table_entities:
                    for f in table_entities[row.child_table]["fields"]:
                        if f["name"] == row.child_column:
                            f["foreign_key"] = True
                # NOTE(review): the relationship is recorded even when a table
                # filter excluded the parent or child table — confirm that
                # such dangling references are intended.
                relationships.append({
                    "name": row.constraint_name or f"{parent_entity.lower()}_{child_entity.lower()}_{row.child_column}_fk",
                    "from": f"{parent_entity}.{row.parent_column}",
                    "to": f"{child_entity}.{row.child_column}",
                    "cardinality": "one_to_many",
                })
        except Exception as e:
            warnings.append(f"Could not fetch foreign keys: {e}")

        entities_list = list(table_entities.values())

        # --- Inference: fill in PKs and FKs when constraints are missing ---
        has_any_pk = any(
            f.get("primary_key") for ent in entities_list for f in ent.get("fields", [])
        )
        if not has_any_pk:
            entities_list, pk_msgs = infer_primary_keys(entities_list)
            warnings.extend(pk_msgs)

        if not relationships:
            inferred_rels, fk_msgs = infer_relationships(entities_list, relationships)
            relationships.extend(inferred_rels)
            warnings.extend(fk_msgs)
            if inferred_rels:
                # Put the headline before the per-relationship messages.
                warnings.insert(0, f"No FK constraints found — inferred {len(inferred_rels)} relationships from column naming patterns.")

        model["entities"] = entities_list
        model["relationships"] = relationships

        return ConnectorResult(
            model=model,
            tables_found=len(table_entities),
            columns_found=total_columns,
            relationships_found=len(relationships),
            indexes_found=0,
            warnings=warnings,
        )