datalex-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. datalex_cli/__init__.py +1 -0
  2. datalex_cli/datalex_cli.py +658 -0
  3. datalex_cli/main.py +2925 -0
  4. datalex_cli-0.1.1.dist-info/METADATA +228 -0
  5. datalex_cli-0.1.1.dist-info/RECORD +64 -0
  6. datalex_cli-0.1.1.dist-info/WHEEL +5 -0
  7. datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
  8. datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
  9. datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
  10. datalex_core/__init__.py +94 -0
  11. datalex_core/_schemas/datalex/common.schema.json +127 -0
  12. datalex_core/_schemas/datalex/domain.schema.json +24 -0
  13. datalex_core/_schemas/datalex/entity.schema.json +158 -0
  14. datalex_core/_schemas/datalex/model.schema.json +141 -0
  15. datalex_core/_schemas/datalex/policy.schema.json +70 -0
  16. datalex_core/_schemas/datalex/project.schema.json +82 -0
  17. datalex_core/_schemas/datalex/snippet.schema.json +24 -0
  18. datalex_core/_schemas/datalex/source.schema.json +104 -0
  19. datalex_core/_schemas/datalex/term.schema.json +30 -0
  20. datalex_core/canonical.py +166 -0
  21. datalex_core/completion.py +204 -0
  22. datalex_core/connectors/__init__.py +39 -0
  23. datalex_core/connectors/base.py +417 -0
  24. datalex_core/connectors/bigquery.py +229 -0
  25. datalex_core/connectors/databricks.py +262 -0
  26. datalex_core/connectors/mysql.py +266 -0
  27. datalex_core/connectors/postgres.py +309 -0
  28. datalex_core/connectors/redshift.py +298 -0
  29. datalex_core/connectors/snowflake.py +336 -0
  30. datalex_core/connectors/sqlserver.py +425 -0
  31. datalex_core/datalex/__init__.py +26 -0
  32. datalex_core/datalex/diff.py +188 -0
  33. datalex_core/datalex/errors.py +85 -0
  34. datalex_core/datalex/loader.py +512 -0
  35. datalex_core/datalex/migrate_layout.py +382 -0
  36. datalex_core/datalex/parse_cache.py +102 -0
  37. datalex_core/datalex/project.py +214 -0
  38. datalex_core/datalex/types.py +224 -0
  39. datalex_core/dbt/__init__.py +18 -0
  40. datalex_core/dbt/emit.py +344 -0
  41. datalex_core/dbt/manifest.py +329 -0
  42. datalex_core/dbt/profiles.py +185 -0
  43. datalex_core/dbt/sync.py +279 -0
  44. datalex_core/dbt/warehouse.py +215 -0
  45. datalex_core/dialects/__init__.py +15 -0
  46. datalex_core/dialects/_common.py +48 -0
  47. datalex_core/dialects/base.py +47 -0
  48. datalex_core/dialects/postgres.py +164 -0
  49. datalex_core/dialects/registry.py +36 -0
  50. datalex_core/dialects/snowflake.py +129 -0
  51. datalex_core/diffing.py +358 -0
  52. datalex_core/docs_generator.py +797 -0
  53. datalex_core/doctor.py +181 -0
  54. datalex_core/generators.py +478 -0
  55. datalex_core/importers.py +1176 -0
  56. datalex_core/issues.py +23 -0
  57. datalex_core/loader.py +21 -0
  58. datalex_core/migrate.py +316 -0
  59. datalex_core/modeling.py +679 -0
  60. datalex_core/packages.py +430 -0
  61. datalex_core/policy.py +1037 -0
  62. datalex_core/resolver.py +456 -0
  63. datalex_core/schema.py +54 -0
  64. datalex_core/semantic.py +1561 -0
@@ -0,0 +1,185 @@
1
+ """Parse dbt profiles.yml to pick a warehouse connection for sync.
2
+
3
+ A dbt project has a `profile:` key in `dbt_project.yml`; that name indexes into
4
+ a `profiles.yml` (either in the project dir or `~/.dbt/profiles.yml`). Each
5
+ profile has a default `target:` and a map of named targets to connection
6
+ config. This module flattens that into a simple `(dialect, config)` tuple that
7
+ `datalex_core.dbt.warehouse.introspect_table()` can consume.
8
+
9
+ We deliberately do NOT import dbt itself. Users who only want to *try* DataLex
10
+ shouldn't need to install dbt just to read their manifest.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import os
16
+ from dataclasses import dataclass
17
+ from pathlib import Path
18
+ from typing import Any, Dict, Optional, Tuple
19
+
20
+ import yaml
21
+
22
+
23
class ProfileError(RuntimeError):
    """Signal that a dbt profile could not be resolved.

    Raised by the helpers in this module when `profiles.yml` or
    `dbt_project.yml` cannot be found, when the requested profile or target
    is absent, or when a target has no `type:`.
    """
25
+
26
+
27
@dataclass
class ProfileTarget:
    """A flattened dbt target, shaped for `warehouse.introspect_table()`."""

    # Top-level key in profiles.yml that this target was read from.
    profile_name: str
    # Key under the profile's `outputs:` that was selected.
    target_name: str
    # Lower-cased value of the target's `type:` entry.
    dialect: str
    # Copy of the target mapping (DuckDB `path` is made absolute on resolve).
    config: Dict[str, Any]
    # Default database taken from `database` / `dbname` / `catalog`, if any.
    database: Optional[str]
    # Default schema taken from `schema` / `dataset`, if any.
    schema: Optional[str]
37
+
38
+
39
def find_profiles_yml(
    dbt_project_dir: Optional[str] = None,
    explicit_path: Optional[str] = None,
) -> Path:
    """Return the first `profiles.yml` found, probing in this order:

    1. --profiles-dir / `explicit_path` (if provided)
    2. DBT_PROFILES_DIR env var
    3. `<dbt_project_dir>/profiles.yml`
    4. `~/.dbt/profiles.yml`

    An `explicit_path` is binding: it may name either the file or its
    directory, and a miss raises immediately instead of falling through.

    Raises:
        ProfileError: if `explicit_path` does not lead to an existing file,
            or if none of the implicit locations has a `profiles.yml`.
    """
    if explicit_path:
        chosen = Path(explicit_path).expanduser()
        if chosen.is_dir():
            chosen = chosen / "profiles.yml"
        if chosen.exists():
            return chosen
        raise ProfileError(f"profiles.yml not found at: {chosen}")

    def _implicit_locations():
        # Yielded lazily so a later location is only computed when every
        # earlier one has already missed.
        configured_dir = os.environ.get("DBT_PROFILES_DIR")
        if configured_dir:
            yield Path(configured_dir).expanduser() / "profiles.yml"
        if dbt_project_dir:
            yield Path(dbt_project_dir) / "profiles.yml"
        yield Path.home() / ".dbt" / "profiles.yml"

    for location in _implicit_locations():
        if location.exists():
            return location

    raise ProfileError(
        "Could not find profiles.yml. Looked in: "
        "--profiles-dir, $DBT_PROFILES_DIR, <project>/profiles.yml, ~/.dbt/profiles.yml"
    )
77
+
78
+
79
def read_dbt_project_profile_name(dbt_project_dir: str) -> str:
    """Return the `profile:` key from dbt_project.yml.

    Args:
        dbt_project_dir: Directory expected to contain `dbt_project.yml`.

    Returns:
        The profile name, coerced to `str`.

    Raises:
        ProfileError: if the file is missing, is not valid YAML, is not a
            YAML mapping, or has no (or an empty) `profile:` key.
    """
    p = Path(dbt_project_dir) / "dbt_project.yml"
    if not p.exists():
        raise ProfileError(f"dbt_project.yml not found in {dbt_project_dir}")
    try:
        with p.open("r", encoding="utf-8") as f:
            proj = yaml.safe_load(f) or {}
    except yaml.YAMLError as e:
        # Report malformed YAML through the module's own error type so that
        # callers handling ProfileError also cover a broken project file.
        raise ProfileError(f"dbt_project.yml at {p} is not valid YAML: {e}") from e
    if not isinstance(proj, dict):
        # A top-level list/scalar has no `.get`; fail with a clear message.
        raise ProfileError(f"dbt_project.yml at {p} must be a YAML mapping")
    name = proj.get("profile")
    if not name:
        raise ProfileError(f"dbt_project.yml at {p} is missing a `profile:` key")
    return str(name)
90
+
91
+
92
def resolve_target(
    profiles_yml: Path,
    profile_name: str,
    target_override: Optional[str] = None,
    base_dir: Optional[Path] = None,
) -> ProfileTarget:
    """Load profiles.yml, pick the named profile, and flatten the chosen target.

    `base_dir` anchors relative paths (e.g. DuckDB `path:`) — typically the dbt
    project directory. Defaults to the profiles.yml parent.

    Args:
        profiles_yml: Path to the profiles file to read.
        profile_name: Top-level key to look up in that file.
        target_override: Target to use instead of the profile's `target:`.
        base_dir: Directory used to make a relative DuckDB path absolute.

    Returns:
        A `ProfileTarget` whose `config` is a shallow copy of the target
        mapping; for DuckDB file targets, `config["path"]` is absolute.

    Raises:
        ProfileError: if the file is not valid YAML or not a mapping, the
            profile or target is missing, `outputs:` is not a mapping, no
            target name can be determined, or the target has no `type:`.
    """
    try:
        with profiles_yml.open("r", encoding="utf-8") as f:
            doc = yaml.safe_load(f) or {}
    except yaml.YAMLError as e:
        raise ProfileError(f"{profiles_yml} is not valid YAML: {e}") from e
    if not isinstance(doc, dict):
        raise ProfileError(f"{profiles_yml} must be a YAML mapping of profile names")

    profile = doc.get(profile_name)
    if not isinstance(profile, dict):
        # `config` is excluded from the listing because it is not a profile;
        # keys are stringified so mixed key types cannot break sorted().
        raise ProfileError(
            f"profile '{profile_name}' not found in {profiles_yml}. "
            f"Available: {sorted(str(k) for k in doc if k != 'config')}"
        )

    outputs = profile.get("outputs") or {}
    if not isinstance(outputs, dict):
        raise ProfileError(
            f"profile '{profile_name}' has a malformed `outputs:` (expected a mapping)"
        )

    target_name = target_override or profile.get("target")
    if not target_name:
        raise ProfileError(
            f"profile '{profile_name}' has no default `target:` and no --target override"
        )

    target = outputs.get(target_name)
    if not isinstance(target, dict):
        raise ProfileError(
            f"target '{target_name}' not found in profile '{profile_name}'. "
            f"Available: {sorted(str(k) for k in outputs)}"
        )

    # `or ""` so that an explicit `type: null` is treated as missing instead
    # of being stringified to the bogus dialect "none".
    dialect = str(target.get("type") or "").lower()
    if not dialect:
        raise ProfileError(
            f"target '{target_name}' in profile '{profile_name}' is missing `type:`"
        )

    config = dict(target)
    anchor = base_dir or profiles_yml.parent
    if dialect == "duckdb":
        raw_path = config.get("path") or config.get("database")
        if raw_path:
            raw_text = str(raw_path)
            # NOTE(review): `:memory:` and `md:` (MotherDuck) are treated as
            # non-filesystem DuckDB targets per dbt-duckdb conventions —
            # confirm against the adapter docs. They must not be joined onto
            # the anchor directory.
            if raw_text == ":memory:" or raw_text.startswith("md:"):
                config["path"] = raw_text
            else:
                rp = Path(raw_text).expanduser()
                if not rp.is_absolute():
                    rp = (anchor / rp).resolve()
                config["path"] = str(rp)

    return ProfileTarget(
        profile_name=profile_name,
        target_name=target_name,
        dialect=dialect,
        config=config,
        database=config.get("database") or config.get("dbname") or config.get("catalog"),
        schema=config.get("schema") or config.get("dataset"),
    )
151
+
152
+
153
def resolve_for_dbt_project(
    dbt_project_dir: str,
    profiles_dir: Optional[str] = None,
    target_override: Optional[str] = None,
) -> ProfileTarget:
    """Resolve the active warehouse target for a dbt project directory.

    The profile name comes from `dbt_project.yml`; the matching
    `profiles.yml` is then located (honouring `profiles_dir` when given) and
    the selected target is flattened. Relative paths inside the target are
    anchored at the resolved project directory.

    Raises:
        ProfileError: propagated from any of the lookup steps.
    """
    name_of_profile = read_dbt_project_profile_name(dbt_project_dir)
    profiles_file = find_profiles_yml(
        dbt_project_dir=dbt_project_dir,
        explicit_path=profiles_dir,
    )
    project_root = Path(dbt_project_dir).resolve()
    return resolve_target(
        profiles_file,
        name_of_profile,
        target_override=target_override,
        base_dir=project_root,
    )
171
+
172
+
173
def as_introspect_args(
    target: ProfileTarget,
    database: Optional[str] = None,
    schema: Optional[str] = None,
    table: Optional[str] = None,
) -> Tuple[str, Dict[str, Any], str, str, str]:
    """Build the `(dialect, config, database, schema, table)` tuple that
    `warehouse.introspect_table()` takes positionally.

    A falsy `database` / `schema` argument is replaced by the target's own
    default; anything still unset (including `table`) becomes `""`.
    """
    effective_db = database or target.database
    effective_schema = schema or target.schema
    return (
        target.dialect,
        target.config,
        effective_db or "",
        effective_schema or "",
        table or "",
    )
@@ -0,0 +1,279 @@
1
+ """`dbt sync` orchestrator — the adoption-shaped one-command flow.
2
+
3
+ Given a dbt project directory, sync pulls:
4
+ 1. `target/manifest.json` (dbt compiles it with `dbt parse`; we just read it)
5
+ 2. The warehouse columns for every source + model, via the active profile
6
+
7
+ And merges them into a DataLex project tree:
8
+ * user-authored fields (descriptions, tags, sensitivity, tests, etc.) are
9
+ preserved — manifest round-trip semantics from phase B
10
+ * `data_type` on every column comes from the warehouse when we can reach it,
11
+ otherwise from the manifest, otherwise left blank
12
+ * on re-sync, the `meta.datalex.dbt.unique_id` stable key means we never
13
+ duplicate entities
14
+
15
+ The flow is offline-safe: if the warehouse is unreachable (or the table hasn't
16
+ been built yet), we degrade to manifest-only columns and annotate a warning.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ from dataclasses import dataclass, field
23
+ from pathlib import Path
24
+ from typing import Any, Dict, List, Optional, Tuple
25
+
26
+ from datalex_core.dbt.manifest import import_manifest, write_import_result
27
+ from datalex_core.dbt.profiles import (
28
+ ProfileError,
29
+ ProfileTarget,
30
+ resolve_for_dbt_project,
31
+ )
32
+ from datalex_core.dbt.warehouse import (
33
+ WarehouseColumn,
34
+ WarehouseError,
35
+ introspect_table,
36
+ )
37
+
38
+
39
+ # ------------------------ report ------------------------
40
+
41
+
42
@dataclass
class TableSyncRecord:
    """Per-table outcome of one sync run (one source table or one model)."""

    # dbt unique_id read from the table's `meta.datalex.dbt` block ("" if absent).
    unique_id: str
    # Either 'source' or 'model'.
    kind: str
    database: Optional[str]
    schema: Optional[str]
    table: str
    # True only when live introspection returned without raising.
    warehouse_reachable: bool
    # Number of columns returned by the warehouse.
    columns_from_warehouse: int = 0
    # Number of manifest columns that already carried a type.
    columns_from_manifest: int = 0
    # Message of the introspection failure, when there was one.
    error: Optional[str] = None
53
+
54
+
55
@dataclass
class SyncReport:
    """Aggregate result of a `dbt sync` run, plus a human-readable summary."""

    dbt_project: str
    datalex_root: str
    # The three profile fields stay None when no warehouse target was resolved.
    profile_name: Optional[str] = None
    target_name: Optional[str] = None
    dialect: Optional[str] = None
    tables: List[TableSyncRecord] = field(default_factory=list)
    files_written: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)

    def summary(self) -> str:
        """Render the report as newline-joined text lines.

        The warnings section is only emitted when at least one warning exists.
        """
        total = len(self.tables)
        from_warehouse = len([t for t in self.tables if t.warehouse_reachable])
        manifest_only = total - from_warehouse

        rendered = ["dbt sync complete"]
        rendered.append(f" dbt project: {self.dbt_project}")
        rendered.append(f" DataLex out: {self.datalex_root}")
        rendered.append(
            f" profile: {self.profile_name} / {self.target_name} ({self.dialect})"
        )
        rendered.append(
            f" tables: {total} "
            f"({from_warehouse} from warehouse, {manifest_only} manifest-only)"
        )
        rendered.append(f" files: {len(self.files_written)} written")

        if self.warnings:
            rendered.append(" warnings:")
            rendered.extend(f" - {w}" for w in self.warnings)
        return "\n".join(rendered)
82
+
83
+
84
+ # ------------------------ public entry point ------------------------
85
+
86
+
87
def sync_dbt_project(
    dbt_project_dir: str,
    datalex_root: str,
    *,
    profiles_dir: Optional[str] = None,
    target_override: Optional[str] = None,
    skip_warehouse: bool = False,
    manifest_path: Optional[str] = None,
) -> SyncReport:
    """Run the full sync: manifest -> DataLex, enriched by live warehouse types.

    Args:
        dbt_project_dir: Directory containing `dbt_project.yml` and
            `target/manifest.json`.
        datalex_root: Where to write the DataLex source/model YAML tree.
        profiles_dir: Override for profiles.yml search (default: dbt's rules).
        target_override: Pick a non-default target from the profile.
        skip_warehouse: Skip live introspection; rely on manifest `data_type`.
        manifest_path: Override `<dbt_project>/target/manifest.json`.

    Returns:
        A `SyncReport` with one `TableSyncRecord` per source table and model,
        the list of files written, and any warnings collected on the way.

    Raises:
        FileNotFoundError: if the manifest file does not exist.
    """
    dbt_dir = Path(dbt_project_dir)
    out_root = Path(datalex_root)
    manifest = Path(manifest_path) if manifest_path else dbt_dir / "target" / "manifest.json"

    if not manifest.exists():
        raise FileNotFoundError(
            f"manifest.json not found at {manifest}. "
            f"Run `dbt parse` (or `dbt compile`) in the dbt project first."
        )

    report = SyncReport(dbt_project=str(dbt_dir), datalex_root=str(out_root))

    # Step 1: parse manifest (merge-preserving re-import)
    imported = import_manifest(str(manifest), existing_project_root=str(out_root))

    # Step 2: resolve warehouse target (optional). A profile failure is not
    # fatal: the sync continues manifest-only and records a warning.
    target: Optional[ProfileTarget] = None
    if not skip_warehouse:
        try:
            target = resolve_for_dbt_project(
                str(dbt_dir),
                profiles_dir=profiles_dir,
                target_override=target_override,
            )
            report.profile_name = target.profile_name
            report.target_name = target.target_name
            report.dialect = target.dialect
        except ProfileError as e:
            report.warnings.append(f"profile lookup failed — manifest-only sync: {e}")

    # Step 3: introspect each source/model and enrich columns
    for source_doc in imported.sources.values():
        for table_doc in source_doc.get("tables", []) or []:
            rec = _enrich_table(
                table_doc,
                database=source_doc.get("database"),
                schema=source_doc.get("schema"),
                target=target,
                kind="source",
            )
            report.tables.append(rec)

    for model_doc in imported.models.values():
        rec = _enrich_table(
            model_doc,
            database=model_doc.get("database"),
            schema=model_doc.get("schema"),
            target=target,
            kind="model",
        )
        report.tables.append(rec)

    # Surface per-table introspection failures as warnings so the text
    # summary explains why tables ended up manifest-only. Identical messages
    # (e.g. one unreachable host hit by every table) are collapsed into a
    # single line with a count, in first-seen order.
    failure_counts: Dict[str, int] = {}
    for rec in report.tables:
        if rec.error:
            failure_counts[rec.error] = failure_counts.get(rec.error, 0) + 1
    for message, count in failure_counts.items():
        report.warnings.append(
            f"warehouse introspection failed for {count} table(s) — "
            f"manifest-only columns kept: {message}"
        )

    # Step 4: write the DataLex tree
    report.files_written = write_import_result(imported, str(out_root))

    return report
163
+
164
+
165
+ # ------------------------ per-table enrichment ------------------------
166
+
167
+
168
def _enrich_table(
    table_doc: Dict[str, Any],
    *,
    database: Optional[str],
    schema: Optional[str],
    target: Optional[ProfileTarget],
    kind: str,
) -> TableSyncRecord:
    """Introspect one table and merge live column info into `table_doc`.

    `table_doc` is mutated in place: when introspection succeeds and the
    merge yields any columns, its `columns` list is replaced.

    Args:
        table_doc: Source-table or model document from the manifest import.
        database: Database recorded for the table, if any.
        schema: Schema recorded for the table; without it no lookup is made.
        target: Resolved warehouse target, or None for a manifest-only run.
        kind: Label stored on the record ('source' or 'model').

    Returns:
        A `TableSyncRecord`. Introspection failures never propagate; they are
        stored in `error` and leave `warehouse_reachable` False.
    """
    # Each level is guarded with `or {}` so a null `meta`, `datalex` or `dbt`
    # entry yields an empty unique_id instead of an AttributeError.
    meta = table_doc.get("meta") or {}
    datalex_meta = meta.get("datalex") or {}
    dbt_meta = datalex_meta.get("dbt") or {}
    uid = dbt_meta.get("unique_id") or ""
    table_name = table_doc.get("identifier") or table_doc.get("name") or ""

    rec = TableSyncRecord(
        unique_id=uid,
        kind=kind,
        database=database,
        schema=schema,
        table=table_name,
        warehouse_reachable=False,
    )

    manifest_cols = list(table_doc.get("columns") or [])
    # Only manifest columns that already carry a type are counted.
    rec.columns_from_manifest = sum(1 for c in manifest_cols if c.get("type"))

    # Nothing to introspect without a target, a schema and a table name.
    if target is None or not schema or not table_name:
        return rec

    db = database or target.database or ""
    try:
        wh_cols = introspect_table(
            dialect=target.dialect,
            config=target.config,
            database=db,
            schema=schema,
            table=table_name,
        )
    except WarehouseError as e:
        rec.error = str(e)
        return rec
    except Exception as e:
        # Deliberate best-effort: driver-specific exceptions must not abort
        # the sync; the exception type is kept in the message for diagnosis.
        rec.error = f"{type(e).__name__}: {e}"
        return rec

    # NOTE(review): an empty result also lands here, so a table the warehouse
    # does not have is still counted as reachable with 0 columns — confirm
    # that this is the intended reporting.
    rec.warehouse_reachable = True
    merged = _merge_warehouse_into_columns(manifest_cols, wh_cols)
    if merged:
        table_doc["columns"] = merged
    rec.columns_from_warehouse = len(wh_cols)
    return rec
221
+
222
+
223
+ def _merge_warehouse_into_columns(
224
+ manifest_cols: List[Dict[str, Any]],
225
+ wh_cols: List[WarehouseColumn],
226
+ ) -> List[Dict[str, Any]]:
227
+ """Warehouse = authoritative for type + nullability + order.
228
+ Manifest/prior DataLex doc = authoritative for everything else
229
+ (description, sensitivity, tags, tests, constraints, etc.)."""
230
+ by_name = {c.get("name"): dict(c) for c in manifest_cols if c.get("name")}
231
+ out: List[Dict[str, Any]] = []
232
+ for wh in wh_cols:
233
+ existing = by_name.pop(wh.name, {"name": wh.name})
234
+ existing["type"] = wh.data_type
235
+ if wh.nullable is False:
236
+ existing["nullable"] = False
237
+ elif "nullable" in existing and existing["nullable"] is True:
238
+ existing.pop("nullable")
239
+ if wh.description and "description" not in existing:
240
+ existing["description"] = wh.description
241
+ out.append(existing)
242
+
243
+ # Any manifest-only columns (e.g. view not yet materialized) keep their
244
+ # place at the end so we don't drop user-authored metadata.
245
+ for leftover in by_name.values():
246
+ out.append(leftover)
247
+ return out
248
+
249
+
250
+ # ------------------------ lightweight JSON view ------------------------
251
+
252
+
253
def report_to_json(report: SyncReport) -> str:
    """Serialize a sync report to a 2-space-indented JSON string.

    Scalars and lists of the report are emitted as-is; every table record is
    flattened to a plain object listing all of its fields.
    """
    table_rows = []
    for t in report.tables:
        table_rows.append(
            {
                "unique_id": t.unique_id,
                "kind": t.kind,
                "database": t.database,
                "schema": t.schema,
                "table": t.table,
                "warehouse_reachable": t.warehouse_reachable,
                "columns_from_warehouse": t.columns_from_warehouse,
                "columns_from_manifest": t.columns_from_manifest,
                "error": t.error,
            }
        )

    payload = {
        "dbt_project": report.dbt_project,
        "datalex_root": report.datalex_root,
        "profile_name": report.profile_name,
        "target_name": report.target_name,
        "dialect": report.dialect,
        "tables": table_rows,
        "files_written": report.files_written,
        "warnings": report.warnings,
    }
    return json.dumps(payload, indent=2)
@@ -0,0 +1,215 @@
1
+ """Warehouse introspection for dbt sync.
2
+
3
+ Given a dialect + connection config + a (database, schema, table) triple,
4
+ return the column list the warehouse actually has. Kept narrow on purpose —
5
+ the existing connectors in datalex_core/connectors/ do full schema discovery; for
6
+ sync we only need per-table column introspection so we can backfill types
7
+ into DataLex files.
8
+
9
+ Supported dialects (v1):
10
+ * duckdb — file-based, no setup (the zero-friction demo path)
11
+ * postgres — information_schema.columns (psycopg2)
12
+
13
+ Other dialects are not supported yet: `introspect_table()` raises
14
+ `WarehouseError` for them instead of falling back to the full-pull
15
+ connectors in datalex_core/connectors/. Supporting another warehouse means
16
+ adding a per-dialect introspection function to this module.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from dataclasses import dataclass
22
+ from typing import Any, Dict, List, Optional
23
+
24
+
25
@dataclass
class WarehouseColumn:
    """One column as reported by live warehouse introspection."""

    name: str
    # Type name after normalization by the per-dialect introspection code.
    data_type: str
    # Defaults to True, i.e. the column is assumed nullable unless stated.
    nullable: bool = True
    # Column comment from the warehouse, when the dialect provides one.
    description: Optional[str] = None
31
+
32
+
33
class WarehouseError(RuntimeError):
    """Signal that a table could not be introspected.

    Raised by this module for an unsupported dialect, a missing database
    driver, or a DuckDB profile without a usable path.
    """
35
+
36
+
37
def introspect_table(
    dialect: str,
    config: Dict[str, Any],
    database: str,
    schema: str,
    table: str,
) -> List[WarehouseColumn]:
    """Fetch the live column list of a single table.

    `config` is a dbt-profile-shaped dict — e.g. `{path: "/tmp/db.duckdb"}` for
    duckdb, `{host, port, user, password, dbname}` for postgres. The dialect
    name is matched case-insensitively and the per-dialect function picks the
    keys it needs.

    Raises:
        WarehouseError: if the dialect is neither duckdb nor postgres.
    """
    dialect = dialect.lower()

    if dialect == "duckdb":
        handler = _introspect_duckdb
    elif dialect in ("postgres", "postgresql"):
        handler = _introspect_postgres
    else:
        raise WarehouseError(
            f"dialect '{dialect}' is not supported yet for `dbt sync`. "
            f"Supported: duckdb, postgres. "
            f"Open an issue or contribute a driver under datalex_core/dbt/warehouse.py."
        )

    return handler(config, database, schema, table)
60
+
61
+
62
+ # ------------------------ DuckDB ------------------------
63
+
64
+
65
def _introspect_duckdb(
    config: Dict[str, Any],
    database: str,
    schema: str,
    table: str,
) -> List[WarehouseColumn]:
    """List the columns of `schema.table` in a DuckDB database file.

    The file is taken from `config["path"]` (or `config["database"]`) and is
    opened read-only. `database` is accepted for signature parity with the
    other dialects but is not used in the query.

    Raises:
        WarehouseError: if the `duckdb` package is missing or no path is set.
    """
    try:
        import duckdb  # type: ignore
    except ImportError as e:
        raise WarehouseError(
            "DuckDB driver not installed. Run: pip install duckdb"
        ) from e

    db_file = config.get("path") or config.get("database")
    if not db_file:
        raise WarehouseError("DuckDB profile needs a `path:` pointing at the .duckdb file.")

    # Columns come from information_schema.columns, in declaration order.
    query = """
        SELECT column_name, data_type, is_nullable
        FROM information_schema.columns
        WHERE table_schema = ? AND table_name = ?
        ORDER BY ordinal_position
        """

    connection = duckdb.connect(str(db_file), read_only=True)
    try:
        records = connection.execute(query, [schema, table]).fetchall()
    finally:
        connection.close()

    columns: List[WarehouseColumn] = []
    for col_name, col_type, nullable_flag in records:
        columns.append(
            WarehouseColumn(
                name=col_name,
                data_type=_normalize_type(str(col_type)),
                nullable=(str(nullable_flag).upper() == "YES"),
            )
        )
    return columns
105
+
106
+
107
+ # ------------------------ Postgres ------------------------
108
+
109
+
110
def _introspect_postgres(
    config: Dict[str, Any],
    database: str,
    schema: str,
    table: str,
) -> List[WarehouseColumn]:
    """List the columns of `schema.table` in a Postgres database.

    Connection settings are read from the dbt-style `config`; `database` is
    only used as the database name when the config has neither `dbname` nor
    `database`. Column comments are returned as descriptions.

    Raises:
        WarehouseError: if `psycopg2` is not installed.
    """
    try:
        import psycopg2  # type: ignore
    except ImportError as e:
        raise WarehouseError(
            "Postgres driver not installed. Run: pip install psycopg2-binary"
        ) from e

    # dbt uses `dbname` (sometimes `database`) + `host`/`port`/`user`/`password`.
    conn = psycopg2.connect(
        host=config.get("host", "localhost"),
        # `or 5432` so an explicit `port: null` falls back to the default
        # instead of crashing in int(None).
        port=int(config.get("port") or 5432),
        user=config.get("user") or config.get("username") or "",
        password=config.get("password", ""),
        dbname=config.get("dbname") or config.get("database") or database,
    )
    try:
        cur = conn.cursor()
        # quote_ident() quotes and escapes each identifier before the
        # regclass cast, so names containing a double quote still resolve.
        cur.execute(
            """
            SELECT column_name, data_type, is_nullable, col_description(
                (quote_ident(table_schema::text) || '.' || quote_ident(table_name::text))::regclass,
                ordinal_position
            )
            FROM information_schema.columns
            WHERE table_schema = %s AND table_name = %s
            ORDER BY ordinal_position
            """,
            (schema, table),
        )
        rows = cur.fetchall()
        cur.close()
    finally:
        conn.close()

    return [
        WarehouseColumn(
            name=r[0],
            data_type=_normalize_type(str(r[1])),
            nullable=(str(r[2]).upper() == "YES"),
            description=r[3] if r[3] else None,
        )
        for r in rows
    ]
159
+
160
+
161
+ # ------------------------ type normalization ------------------------
162
+
163
+
164
+ _TYPE_ALIASES = {
165
+ "character varying": "string",
166
+ "varchar": "string",
167
+ "text": "string",
168
+ "character": "string",
169
+ "char": "string",
170
+ "double precision": "double",
171
+ "double": "double",
172
+ "real": "float",
173
+ "numeric": "decimal",
174
+ "integer": "int",
175
+ "int4": "int",
176
+ "int8": "bigint",
177
+ "bigint": "bigint",
178
+ "smallint": "smallint",
179
+ "int2": "smallint",
180
+ "boolean": "boolean",
181
+ "bool": "boolean",
182
+ "timestamp without time zone": "timestamp",
183
+ "timestamp with time zone": "timestamp_tz",
184
+ "timestamp": "timestamp",
185
+ "date": "date",
186
+ "time": "time",
187
+ "uuid": "uuid",
188
+ "json": "json",
189
+ "jsonb": "json",
190
+ "bytea": "binary",
191
+ "blob": "binary",
192
+ "decimal": "decimal",
193
+ "hugeint": "bigint",
194
+ "utinyint": "smallint",
195
+ "usmallint": "int",
196
+ "uinteger": "bigint",
197
+ "ubigint": "bigint",
198
+ }
199
+
200
+
201
+ def _normalize_type(raw: str) -> str:
202
+ """Fold warehouse-specific type names to the DataLex canonical palette.
203
+
204
+ Unknown types pass through unchanged — the DataLex layer is permissive
205
+ about types at the physical layer.
206
+ """
207
+ raw_l = raw.lower().strip()
208
+ if raw_l in _TYPE_ALIASES:
209
+ return _TYPE_ALIASES[raw_l]
210
+ # Preserve parametric types: "varchar(255)" → "string(255)"
211
+ if "(" in raw_l:
212
+ head, tail = raw_l.split("(", 1)
213
+ base = _TYPE_ALIASES.get(head.strip(), head.strip())
214
+ return f"{base}({tail}"
215
+ return raw_l
@@ -0,0 +1,15 @@
1
+ """Dialect plugin registry.
2
+
3
+ Each SQL/NoSQL target engine ships as a module under this package implementing the
4
+ DialectPlugin protocol in `base.py`. The registry in `registry.py` is the single
5
+ entry point for code that wants to emit DDL or type-map without knowing which
6
+ dialect is in play.
7
+
8
+ Ports in Phase A: postgres, snowflake. The legacy monolithic `generators.py`
9
+ remains available as a fallback and continues to serve bigquery/databricks/mysql/
10
+ sqlserver until those are ported (Phase A task 5).
11
+ """
12
+
13
+ from datalex_core.dialects import base, registry, postgres, snowflake # noqa: F401
14
+
15
+ __all__ = ["base", "registry"]