datalex-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datalex_cli/__init__.py +1 -0
- datalex_cli/datalex_cli.py +658 -0
- datalex_cli/main.py +2925 -0
- datalex_cli-0.1.1.dist-info/METADATA +228 -0
- datalex_cli-0.1.1.dist-info/RECORD +64 -0
- datalex_cli-0.1.1.dist-info/WHEEL +5 -0
- datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
- datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
- datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
- datalex_core/__init__.py +94 -0
- datalex_core/_schemas/datalex/common.schema.json +127 -0
- datalex_core/_schemas/datalex/domain.schema.json +24 -0
- datalex_core/_schemas/datalex/entity.schema.json +158 -0
- datalex_core/_schemas/datalex/model.schema.json +141 -0
- datalex_core/_schemas/datalex/policy.schema.json +70 -0
- datalex_core/_schemas/datalex/project.schema.json +82 -0
- datalex_core/_schemas/datalex/snippet.schema.json +24 -0
- datalex_core/_schemas/datalex/source.schema.json +104 -0
- datalex_core/_schemas/datalex/term.schema.json +30 -0
- datalex_core/canonical.py +166 -0
- datalex_core/completion.py +204 -0
- datalex_core/connectors/__init__.py +39 -0
- datalex_core/connectors/base.py +417 -0
- datalex_core/connectors/bigquery.py +229 -0
- datalex_core/connectors/databricks.py +262 -0
- datalex_core/connectors/mysql.py +266 -0
- datalex_core/connectors/postgres.py +309 -0
- datalex_core/connectors/redshift.py +298 -0
- datalex_core/connectors/snowflake.py +336 -0
- datalex_core/connectors/sqlserver.py +425 -0
- datalex_core/datalex/__init__.py +26 -0
- datalex_core/datalex/diff.py +188 -0
- datalex_core/datalex/errors.py +85 -0
- datalex_core/datalex/loader.py +512 -0
- datalex_core/datalex/migrate_layout.py +382 -0
- datalex_core/datalex/parse_cache.py +102 -0
- datalex_core/datalex/project.py +214 -0
- datalex_core/datalex/types.py +224 -0
- datalex_core/dbt/__init__.py +18 -0
- datalex_core/dbt/emit.py +344 -0
- datalex_core/dbt/manifest.py +329 -0
- datalex_core/dbt/profiles.py +185 -0
- datalex_core/dbt/sync.py +279 -0
- datalex_core/dbt/warehouse.py +215 -0
- datalex_core/dialects/__init__.py +15 -0
- datalex_core/dialects/_common.py +48 -0
- datalex_core/dialects/base.py +47 -0
- datalex_core/dialects/postgres.py +164 -0
- datalex_core/dialects/registry.py +36 -0
- datalex_core/dialects/snowflake.py +129 -0
- datalex_core/diffing.py +358 -0
- datalex_core/docs_generator.py +797 -0
- datalex_core/doctor.py +181 -0
- datalex_core/generators.py +478 -0
- datalex_core/importers.py +1176 -0
- datalex_core/issues.py +23 -0
- datalex_core/loader.py +21 -0
- datalex_core/migrate.py +316 -0
- datalex_core/modeling.py +679 -0
- datalex_core/packages.py +430 -0
- datalex_core/policy.py +1037 -0
- datalex_core/resolver.py +456 -0
- datalex_core/schema.py +54 -0
- datalex_core/semantic.py +1561 -0
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Parse dbt profiles.yml to pick a warehouse connection for sync.
|
|
2
|
+
|
|
3
|
+
A dbt project has a `profile:` key in `dbt_project.yml`; that name indexes into
|
|
4
|
+
a `profiles.yml` (either in the project dir or `~/.dbt/profiles.yml`). Each
|
|
5
|
+
profile has a default `target:` and a map of named targets to connection
|
|
6
|
+
config. This module flattens that into a simple `(dialect, config)` tuple that
|
|
7
|
+
`datalex_core.dbt.warehouse.introspect_table()` can consume.
|
|
8
|
+
|
|
9
|
+
We deliberately do NOT import dbt itself. Users who only want to *try* DataLex
|
|
10
|
+
shouldn't need to install dbt just to read their manifest.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any, Dict, Optional, Tuple
|
|
19
|
+
|
|
20
|
+
import yaml
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ProfileError(RuntimeError):
    """Signal that a dbt profile could not be resolved.

    Used for a missing file, malformed content, or a profile without a
    usable target.
    """
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class ProfileTarget:
    """A flattened dbt target: the inputs `warehouse.introspect_table()` takes."""

    # Key of the profile inside profiles.yml.
    profile_name: str
    # Key of the selected entry under the profile's `outputs:` map.
    target_name: str
    # Lower-cased `type:` of the target, e.g. "duckdb".
    dialect: str
    # Copy of the target mapping (connection settings as written in the profile).
    config: Dict[str, Any]
    # Default database for the target, when the profile declares one.
    database: Optional[str]
    # Default schema for the target, when the profile declares one.
    schema: Optional[str]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def find_profiles_yml(
    dbt_project_dir: Optional[str] = None,
    explicit_path: Optional[str] = None,
) -> Path:
    """Find the profiles.yml to use, checking locations in this order:

    1. --profiles-dir / `explicit_path` (if provided)
    2. DBT_PROFILES_DIR env var
    3. `<dbt_project_dir>/profiles.yml`
    4. `~/.dbt/profiles.yml`

    An `explicit_path` may name either the file itself or the directory that
    holds it, and is binding: if it does not exist a `ProfileError` is raised
    without trying the other locations. The remaining locations are tried in
    turn and the first existing file wins.

    Raises:
        ProfileError: when no candidate exists.
    """
    if explicit_path:
        chosen = Path(explicit_path).expanduser()
        if chosen.is_dir():
            chosen = chosen / "profiles.yml"
        if chosen.exists():
            return chosen
        raise ProfileError(f"profiles.yml not found at: {chosen}")

    def _fallbacks():
        # Lazily yield the implicit locations so later ones are only
        # computed when the earlier ones did not match.
        env_dir = os.environ.get("DBT_PROFILES_DIR")
        if env_dir:
            yield Path(env_dir).expanduser() / "profiles.yml"
        if dbt_project_dir:
            yield Path(dbt_project_dir) / "profiles.yml"
        yield Path.home() / ".dbt" / "profiles.yml"

    for candidate in _fallbacks():
        if candidate.exists():
            return candidate

    raise ProfileError(
        "Could not find profiles.yml. Looked in: "
        "--profiles-dir, $DBT_PROFILES_DIR, <project>/profiles.yml, ~/.dbt/profiles.yml"
    )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def read_dbt_project_profile_name(dbt_project_dir: str) -> str:
    """Return the `profile:` key from dbt_project.yml.

    Args:
        dbt_project_dir: Directory expected to contain `dbt_project.yml`.

    Returns:
        The profile name, coerced to `str`.

    Raises:
        ProfileError: if the file is missing, is not a YAML mapping, or has
            no (or an empty) `profile:` key.
    """
    p = Path(dbt_project_dir) / "dbt_project.yml"
    if not p.exists():
        raise ProfileError(f"dbt_project.yml not found in {dbt_project_dir}")
    with p.open("r", encoding="utf-8") as f:
        proj = yaml.safe_load(f) or {}
    # A YAML document whose top level is a list or scalar has no `.get`;
    # report it as a malformed project file instead of an AttributeError.
    if not isinstance(proj, dict):
        raise ProfileError(f"dbt_project.yml at {p} is not a YAML mapping")
    name = proj.get("profile")
    if not name:
        raise ProfileError(f"dbt_project.yml at {p} is missing a `profile:` key")
    return str(name)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def resolve_target(
    profiles_yml: Path,
    profile_name: str,
    target_override: Optional[str] = None,
    base_dir: Optional[Path] = None,
) -> ProfileTarget:
    """Load profiles.yml, pick the named profile, and flatten the chosen target.

    Args:
        profiles_yml: Path of the profiles file to read.
        profile_name: Top-level key of the profile to use.
        target_override: Target to use instead of the profile's `target:`.
        base_dir: Anchor for relative paths (e.g. DuckDB `path:`) — typically
            the dbt project directory. Defaults to the profiles.yml parent.

    Returns:
        A `ProfileTarget` whose `config` is a shallow copy of the target
        mapping. For DuckDB targets `config["path"]` is set to an absolute
        path unless it names a non-filesystem database.

    Raises:
        ProfileError: if the document, profile, `outputs:` section or target
            is missing or malformed, or the target has no `type:`.
    """
    with profiles_yml.open("r", encoding="utf-8") as f:
        doc = yaml.safe_load(f) or {}
    # Guard against a top-level list/scalar, which has no `.get`.
    if not isinstance(doc, dict):
        raise ProfileError(f"{profiles_yml} is not a YAML mapping of profile names")

    profile = doc.get(profile_name)
    if not isinstance(profile, dict):
        # str() keeps sorted() from failing on mixed key types (e.g. 1 and "a").
        available = sorted(str(k) for k in doc if k != "config")
        raise ProfileError(
            f"profile '{profile_name}' not found in {profiles_yml}. "
            f"Available: {available}"
        )

    outputs = profile.get("outputs") or {}
    if not isinstance(outputs, dict):
        raise ProfileError(
            f"profile '{profile_name}' has a malformed `outputs:` section "
            f"(expected a mapping of target names)"
        )

    target_name = target_override or profile.get("target")
    if not target_name:
        # The override in question selects a target, not a profile.
        raise ProfileError(
            f"profile '{profile_name}' has no default `target:` and no target override"
        )

    target = outputs.get(target_name)
    if not isinstance(target, dict):
        raise ProfileError(
            f"target '{target_name}' not found in profile '{profile_name}'. "
            f"Available: {sorted(str(k) for k in outputs)}"
        )

    # `type: null` must count as missing rather than become the string "none".
    dialect = str(target.get("type") or "").lower()
    if not dialect:
        raise ProfileError(
            f"target '{target_name}' in profile '{profile_name}' is missing `type:`"
        )

    config = dict(target)
    anchor = base_dir or profiles_yml.parent
    if dialect == "duckdb":
        raw_path = config.get("path") or config.get("database")
        if raw_path:
            raw_text = str(raw_path)
            if raw_text == ":memory:" or raw_text.startswith(("md:", "motherduck:")):
                # In-memory and MotherDuck databases are not files; anchoring
                # them to a directory would point at a nonexistent path.
                config["path"] = raw_text
            else:
                rp = Path(raw_text).expanduser()
                if not rp.is_absolute():
                    rp = (anchor / rp).resolve()
                config["path"] = str(rp)

    return ProfileTarget(
        profile_name=profile_name,
        target_name=target_name,
        dialect=dialect,
        config=config,
        database=config.get("database") or config.get("dbname") or config.get("catalog"),
        schema=config.get("schema") or config.get("dataset"),
    )
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def resolve_for_dbt_project(
    dbt_project_dir: str,
    profiles_dir: Optional[str] = None,
    target_override: Optional[str] = None,
) -> ProfileTarget:
    """Resolve the active target of a dbt project in one call.

    The profile name is read from `dbt_project.yml`, profiles.yml is located
    (with `profiles_dir` as the explicit location, if given), and the chosen
    target is flattened with the resolved project directory as the anchor for
    relative paths.
    """
    name = read_dbt_project_profile_name(dbt_project_dir)
    profiles_file = find_profiles_yml(
        dbt_project_dir=dbt_project_dir,
        explicit_path=profiles_dir,
    )
    project_root = Path(dbt_project_dir).resolve()
    return resolve_target(
        profiles_file,
        name,
        target_override=target_override,
        base_dir=project_root,
    )
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def as_introspect_args(
    target: ProfileTarget,
    database: Optional[str] = None,
    schema: Optional[str] = None,
    table: Optional[str] = None,
) -> Tuple[str, Dict[str, Any], str, str, str]:
    """Build the `(dialect, config, database, schema, table)` tuple for
    `warehouse.introspect_table()`.

    A falsy `database` or `schema` falls back to the target's own default,
    and anything still unset becomes the empty string.
    """
    return (
        target.dialect,
        target.config,
        database or target.database or "",
        schema or target.schema or "",
        table or "",
    )
|
datalex_core/dbt/sync.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
"""`dbt sync` orchestrator — the adoption-shaped one-command flow.
|
|
2
|
+
|
|
3
|
+
Given a dbt project directory, sync pulls:
|
|
4
|
+
1. `target/manifest.json` (dbt compiles it with `dbt parse`; we just read it)
|
|
5
|
+
2. The warehouse columns for every source + model, via the active profile
|
|
6
|
+
|
|
7
|
+
And merges them into a DataLex project tree:
|
|
8
|
+
* user-authored fields (descriptions, tags, sensitivity, tests, etc.) are
|
|
9
|
+
preserved — manifest round-trip semantics from phase B
|
|
10
|
+
* `data_type` on every column comes from the warehouse when we can reach it,
|
|
11
|
+
otherwise from the manifest, otherwise left blank
|
|
12
|
+
* on re-sync, the `meta.datalex.dbt.unique_id` stable key means we never
|
|
13
|
+
duplicate entities
|
|
14
|
+
|
|
15
|
+
The flow is offline-safe: if the warehouse is unreachable (or the table hasn't
|
|
16
|
+
been built yet), we degrade to manifest-only columns and annotate a warning.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
25
|
+
|
|
26
|
+
from datalex_core.dbt.manifest import import_manifest, write_import_result
|
|
27
|
+
from datalex_core.dbt.profiles import (
|
|
28
|
+
ProfileError,
|
|
29
|
+
ProfileTarget,
|
|
30
|
+
resolve_for_dbt_project,
|
|
31
|
+
)
|
|
32
|
+
from datalex_core.dbt.warehouse import (
|
|
33
|
+
WarehouseColumn,
|
|
34
|
+
WarehouseError,
|
|
35
|
+
introspect_table,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ------------------------ report ------------------------
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class TableSyncRecord:
    """Outcome of syncing one source table or model."""

    # Value of `meta.datalex.dbt.unique_id` on the table document ("" if absent).
    unique_id: str
    # Either 'source' or 'model'.
    kind: str
    database: Optional[str]
    schema: Optional[str]
    table: str
    # True once live introspection of this table returned without error.
    warehouse_reachable: bool
    # Number of columns the warehouse reported.
    columns_from_warehouse: int = 0
    # Number of manifest columns that already carried a type.
    columns_from_manifest: int = 0
    # Message of the introspection failure, if there was one.
    error: Optional[str] = None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class SyncReport:
    """Aggregate result of one `dbt sync` run."""

    dbt_project: str
    datalex_root: str
    # These three stay None when no warehouse target was resolved.
    profile_name: Optional[str] = None
    target_name: Optional[str] = None
    dialect: Optional[str] = None
    tables: List[TableSyncRecord] = field(default_factory=list)
    files_written: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)

    def summary(self) -> str:
        """Render the report as a multi-line, human-readable string."""
        total = len(self.tables)
        reached = len([t for t in self.tables if t.warehouse_reachable])
        manifest_only = total - reached
        parts = [
            "dbt sync complete",
            f" dbt project: {self.dbt_project}",
            f" DataLex out: {self.datalex_root}",
            f" profile: {self.profile_name} / {self.target_name} ({self.dialect})",
            f" tables: {total} ({reached} from warehouse, {manifest_only} manifest-only)",
            f" files: {len(self.files_written)} written",
        ]
        # The warnings section appears only when there is something to show.
        if self.warnings:
            parts.append(" warnings:")
            parts.extend(f" - {w}" for w in self.warnings)
        return "\n".join(parts)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# ------------------------ public entry point ------------------------
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def sync_dbt_project(
    dbt_project_dir: str,
    datalex_root: str,
    *,
    profiles_dir: Optional[str] = None,
    target_override: Optional[str] = None,
    skip_warehouse: bool = False,
    manifest_path: Optional[str] = None,
) -> SyncReport:
    """Sync a dbt project into a DataLex tree, enriching columns from the warehouse.

    Args:
        dbt_project_dir: Directory containing `dbt_project.yml` and
            `target/manifest.json`.
        datalex_root: Where to write the DataLex source/model YAML tree.
        profiles_dir: Override for profiles.yml search (default: dbt's rules).
        target_override: Pick a non-default target from the profile.
        skip_warehouse: Skip live introspection; rely on manifest `data_type`.
        manifest_path: Override `<dbt_project>/target/manifest.json`.

    Returns:
        A `SyncReport` with one record per source table and model, the files
        written, and any warnings.

    Raises:
        FileNotFoundError: if the manifest file does not exist.
    """
    dbt_dir = Path(dbt_project_dir)
    out_root = Path(datalex_root)
    if manifest_path:
        manifest = Path(manifest_path)
    else:
        manifest = dbt_dir / "target" / "manifest.json"

    if not manifest.exists():
        raise FileNotFoundError(
            f"manifest.json not found at {manifest}. "
            f"Run `dbt parse` (or `dbt compile`) in the dbt project first."
        )

    report = SyncReport(dbt_project=str(dbt_dir), datalex_root=str(out_root))

    # 1) Read the manifest, merging with whatever already exists at out_root.
    imported = import_manifest(str(manifest), existing_project_root=str(out_root))

    # 2) Resolve the warehouse target unless asked not to. A profile problem
    #    is recorded as a warning and the run continues without a target.
    target: Optional[ProfileTarget] = None
    if not skip_warehouse:
        try:
            target = resolve_for_dbt_project(
                str(dbt_dir),
                profiles_dir=profiles_dir,
                target_override=target_override,
            )
        except ProfileError as e:
            report.warnings.append(f"profile lookup failed — manifest-only sync: {e}")
        else:
            report.profile_name = target.profile_name
            report.target_name = target.target_name
            report.dialect = target.dialect

    # 3) Enrich every source table, then every model, in that order.
    def _work_items():
        for source_doc in imported.sources.values():
            for table_doc in source_doc.get("tables", []) or []:
                yield table_doc, source_doc.get("database"), source_doc.get("schema"), "source"
        for model_doc in imported.models.values():
            yield model_doc, model_doc.get("database"), model_doc.get("schema"), "model"

    for doc, doc_database, doc_schema, doc_kind in _work_items():
        report.tables.append(
            _enrich_table(
                doc,
                database=doc_database,
                schema=doc_schema,
                target=target,
                kind=doc_kind,
            )
        )

    # 4) Persist the (possibly enriched) documents.
    report.files_written = write_import_result(imported, str(out_root))
    return report
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# ------------------------ per-table enrichment ------------------------
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _enrich_table(
    table_doc: Dict[str, Any],
    *,
    database: Optional[str],
    schema: Optional[str],
    target: Optional[ProfileTarget],
    kind: str,
) -> TableSyncRecord:
    """Enrich one table/model document in place with live warehouse columns.

    Args:
        table_doc: Source-table or model document; its `columns` entry is
            replaced when introspection yields a merged column list.
        database: Database of the table, falling back to the target default.
        schema: Schema of the table; without it no introspection is attempted.
        target: Resolved warehouse target, or None for a manifest-only run.
        kind: 'source' or 'model', copied onto the record.

    Returns:
        A `TableSyncRecord` for the table. Introspection failures never
        propagate: they are stored in `error` and the document is left
        untouched.
    """
    # Walk meta -> datalex -> dbt defensively: any level may be absent, null,
    # or not a mapping in hand-edited YAML.
    node: Any = table_doc
    for key in ("meta", "datalex", "dbt"):
        node = node.get(key) if isinstance(node, dict) else None
    uid = (node.get("unique_id") if isinstance(node, dict) else None) or ""
    table_name = table_doc.get("identifier") or table_doc.get("name") or ""

    rec = TableSyncRecord(
        unique_id=uid,
        kind=kind,
        database=database,
        schema=schema,
        table=table_name,
        warehouse_reachable=False,
    )

    manifest_cols = list(table_doc.get("columns") or [])
    rec.columns_from_manifest = sum(1 for c in manifest_cols if c.get("type"))

    # Nothing to introspect without a target and a fully-qualified table.
    if target is None or not schema or not table_name:
        return rec

    db = database or target.database or ""
    try:
        wh_cols = introspect_table(
            dialect=target.dialect,
            config=target.config,
            database=db,
            schema=schema,
            table=table_name,
        )
    except WarehouseError as e:
        rec.error = str(e)
        return rec
    except Exception as e:
        # Deliberate best-effort: driver errors of any type are recorded
        # against this table instead of aborting the whole sync.
        rec.error = f"{type(e).__name__}: {e}"
        return rec

    rec.warehouse_reachable = True
    merged = _merge_warehouse_into_columns(manifest_cols, wh_cols)
    if merged:
        table_doc["columns"] = merged
    rec.columns_from_warehouse = len(wh_cols)
    return rec
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _merge_warehouse_into_columns(
|
|
224
|
+
manifest_cols: List[Dict[str, Any]],
|
|
225
|
+
wh_cols: List[WarehouseColumn],
|
|
226
|
+
) -> List[Dict[str, Any]]:
|
|
227
|
+
"""Warehouse = authoritative for type + nullability + order.
|
|
228
|
+
Manifest/prior DataLex doc = authoritative for everything else
|
|
229
|
+
(description, sensitivity, tags, tests, constraints, etc.)."""
|
|
230
|
+
by_name = {c.get("name"): dict(c) for c in manifest_cols if c.get("name")}
|
|
231
|
+
out: List[Dict[str, Any]] = []
|
|
232
|
+
for wh in wh_cols:
|
|
233
|
+
existing = by_name.pop(wh.name, {"name": wh.name})
|
|
234
|
+
existing["type"] = wh.data_type
|
|
235
|
+
if wh.nullable is False:
|
|
236
|
+
existing["nullable"] = False
|
|
237
|
+
elif "nullable" in existing and existing["nullable"] is True:
|
|
238
|
+
existing.pop("nullable")
|
|
239
|
+
if wh.description and "description" not in existing:
|
|
240
|
+
existing["description"] = wh.description
|
|
241
|
+
out.append(existing)
|
|
242
|
+
|
|
243
|
+
# Any manifest-only columns (e.g. view not yet materialized) keep their
|
|
244
|
+
# place at the end so we don't drop user-authored metadata.
|
|
245
|
+
for leftover in by_name.values():
|
|
246
|
+
out.append(leftover)
|
|
247
|
+
return out
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# ------------------------ lightweight JSON view ------------------------
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def report_to_json(report: SyncReport) -> str:
    """Serialize a sync report as a JSON object indented by two spaces."""

    def _table_view(t) -> dict:
        # Key order here is the key order of each emitted table object.
        return {
            "unique_id": t.unique_id,
            "kind": t.kind,
            "database": t.database,
            "schema": t.schema,
            "table": t.table,
            "warehouse_reachable": t.warehouse_reachable,
            "columns_from_warehouse": t.columns_from_warehouse,
            "columns_from_manifest": t.columns_from_manifest,
            "error": t.error,
        }

    payload = {
        "dbt_project": report.dbt_project,
        "datalex_root": report.datalex_root,
        "profile_name": report.profile_name,
        "target_name": report.target_name,
        "dialect": report.dialect,
        "tables": [_table_view(t) for t in report.tables],
        "files_written": report.files_written,
        "warnings": report.warnings,
    }
    return json.dumps(payload, indent=2)
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""Warehouse introspection for dbt sync.
|
|
2
|
+
|
|
3
|
+
Given a dialect + connection config + a (database, schema, table) triple,
|
|
4
|
+
return the column list the warehouse actually has. Kept narrow on purpose —
|
|
5
|
+
the existing connectors in datalex_core/connectors/ do full schema discovery; for
|
|
6
|
+
sync we only need per-table column introspection so we can backfill types
|
|
7
|
+
into DataLex files.
|
|
8
|
+
|
|
9
|
+
Supported dialects (v1):
|
|
10
|
+
* duckdb — file-based, no setup (the zero-friction demo path)
|
|
11
|
+
* postgres — information_schema.columns (psycopg2)
|
|
12
|
+
|
|
13
|
+
Other dialects are not supported yet: `introspect_table()` raises
|
|
14
|
+
`WarehouseError` for them instead of falling back to the full-pull connectors.
|
|
15
|
+
A fallback through datalex_core/connectors/ would be slower, but it would let
|
|
16
|
+
`dbt sync` work against any warehouse that already has a connector.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from typing import Any, Dict, List, Optional
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class WarehouseColumn:
    """One column as reported by live warehouse introspection."""

    name: str
    # Type name after normalization by the introspection functions.
    data_type: str
    # Defaults to nullable when nothing says otherwise.
    nullable: bool = True
    # Column comment from the warehouse, when one is available.
    description: Optional[str] = None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class WarehouseError(RuntimeError):
    """Signal that warehouse introspection failed or is not supported."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def introspect_table(
    dialect: str,
    config: Dict[str, Any],
    database: str,
    schema: str,
    table: str,
) -> List[WarehouseColumn]:
    """Fetch the live column list of a single table.

    `config` is a dbt-profile-shaped dict — e.g. `{path: "/tmp/db.duckdb"}` for
    duckdb, `{host, port, user, password, dbname}` for postgres. Each
    per-dialect function picks the keys it needs.

    The dialect name is matched case-insensitively; "postgres" and
    "postgresql" are equivalent.

    Raises:
        WarehouseError: for any dialect other than duckdb or postgres.
    """
    key = dialect.lower()
    if key == "duckdb":
        handler = _introspect_duckdb
    elif key in ("postgres", "postgresql"):
        handler = _introspect_postgres
    else:
        raise WarehouseError(
            f"dialect '{key}' is not supported yet for `dbt sync`. "
            f"Supported: duckdb, postgres. "
            f"Open an issue or contribute a driver under datalex_core/dbt/warehouse.py."
        )
    return handler(config, database, schema, table)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ------------------------ DuckDB ------------------------
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _introspect_duckdb(
    config: Dict[str, Any],
    database: str,
    schema: str,
    table: str,
) -> List[WarehouseColumn]:
    """List the columns of `schema.table` in a DuckDB database file.

    The file comes from `config["path"]`, falling back to
    `config["database"]`, and is opened read-only. `database` is accepted
    for signature parity with the other dialects but is not used in the
    query.

    Raises:
        WarehouseError: if the duckdb package is missing or no path is
            configured.
    """
    try:
        import duckdb  # type: ignore
    except ImportError as e:
        raise WarehouseError(
            "DuckDB driver not installed. Run: pip install duckdb"
        ) from e

    db_file = config.get("path") or config.get("database")
    if not db_file:
        raise WarehouseError("DuckDB profile needs a `path:` pointing at the .duckdb file.")

    # Columns are read from information_schema.columns in ordinal order.
    query = """
        SELECT column_name, data_type, is_nullable
        FROM information_schema.columns
        WHERE table_schema = ? AND table_name = ?
        ORDER BY ordinal_position
        """
    conn = duckdb.connect(str(db_file), read_only=True)
    try:
        rows = conn.execute(query, [schema, table]).fetchall()
    finally:
        conn.close()

    columns: List[WarehouseColumn] = []
    for col_name, raw_type, is_nullable in rows:
        columns.append(
            WarehouseColumn(
                name=col_name,
                data_type=_normalize_type(str(raw_type)),
                nullable=(str(is_nullable).upper() == "YES"),
            )
        )
    return columns
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# ------------------------ Postgres ------------------------
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _introspect_postgres(
    config: Dict[str, Any],
    database: str,
    schema: str,
    table: str,
) -> List[WarehouseColumn]:
    """List the columns (with comments) of `schema.table` in Postgres.

    Args:
        config: dbt-profile-shaped connection settings. Reads `host`, `port`,
            `user`/`username`, `password`/`pass`, `dbname`/`database` and,
            when present, `connect_timeout`.
        database: Database to connect to when the config names none.
        schema: Schema of the table.
        table: Table name.

    Returns:
        One `WarehouseColumn` per column, in ordinal order, with the column
        comment as `description` when one exists.

    Raises:
        WarehouseError: if psycopg2 is not installed. Connection and query
            errors from the driver propagate unchanged.
    """
    try:
        import psycopg2  # type: ignore
    except ImportError as e:
        raise WarehouseError(
            "Postgres driver not installed. Run: pip install psycopg2-binary"
        ) from e

    # dbt uses `dbname` (sometimes `database`) + `host`/`port`/`user`/`password`;
    # `pass` is dbt's alias for `password`.
    connect_kwargs: Dict[str, Any] = {
        "host": config.get("host", "localhost"),
        # `port: null` in a profile must not crash int().
        "port": int(config.get("port") or 5432),
        "user": config.get("user") or config.get("username") or "",
        "password": config.get("password") or config.get("pass") or "",
        "dbname": config.get("dbname") or config.get("database") or database,
    }
    # Honor the profile's timeout when it sets one; otherwise leave the
    # driver default untouched.
    if config.get("connect_timeout"):
        connect_kwargs["connect_timeout"] = int(config["connect_timeout"])

    conn = psycopg2.connect(**connect_kwargs)
    try:
        with conn.cursor() as cur:
            # quote_ident() escapes embedded double quotes, which plain
            # '"' || name || '"' concatenation does not.
            cur.execute(
                """
                SELECT column_name, data_type, is_nullable, col_description(
                    (quote_ident(table_schema::text) || '.' || quote_ident(table_name::text))::regclass,
                    ordinal_position
                )
                FROM information_schema.columns
                WHERE table_schema = %s AND table_name = %s
                ORDER BY ordinal_position
                """,
                (schema, table),
            )
            rows = cur.fetchall()
    finally:
        conn.close()

    return [
        WarehouseColumn(
            name=r[0],
            data_type=_normalize_type(str(r[1])),
            nullable=(str(r[2]).upper() == "YES"),
            description=r[3] if r[3] else None,
        )
        for r in rows
    ]
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ------------------------ type normalization ------------------------
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
_TYPE_ALIASES = {
|
|
165
|
+
"character varying": "string",
|
|
166
|
+
"varchar": "string",
|
|
167
|
+
"text": "string",
|
|
168
|
+
"character": "string",
|
|
169
|
+
"char": "string",
|
|
170
|
+
"double precision": "double",
|
|
171
|
+
"double": "double",
|
|
172
|
+
"real": "float",
|
|
173
|
+
"numeric": "decimal",
|
|
174
|
+
"integer": "int",
|
|
175
|
+
"int4": "int",
|
|
176
|
+
"int8": "bigint",
|
|
177
|
+
"bigint": "bigint",
|
|
178
|
+
"smallint": "smallint",
|
|
179
|
+
"int2": "smallint",
|
|
180
|
+
"boolean": "boolean",
|
|
181
|
+
"bool": "boolean",
|
|
182
|
+
"timestamp without time zone": "timestamp",
|
|
183
|
+
"timestamp with time zone": "timestamp_tz",
|
|
184
|
+
"timestamp": "timestamp",
|
|
185
|
+
"date": "date",
|
|
186
|
+
"time": "time",
|
|
187
|
+
"uuid": "uuid",
|
|
188
|
+
"json": "json",
|
|
189
|
+
"jsonb": "json",
|
|
190
|
+
"bytea": "binary",
|
|
191
|
+
"blob": "binary",
|
|
192
|
+
"decimal": "decimal",
|
|
193
|
+
"hugeint": "bigint",
|
|
194
|
+
"utinyint": "smallint",
|
|
195
|
+
"usmallint": "int",
|
|
196
|
+
"uinteger": "bigint",
|
|
197
|
+
"ubigint": "bigint",
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _normalize_type(raw: str) -> str:
|
|
202
|
+
"""Fold warehouse-specific type names to the DataLex canonical palette.
|
|
203
|
+
|
|
204
|
+
Unknown types pass through unchanged — the DataLex layer is permissive
|
|
205
|
+
about types at the physical layer.
|
|
206
|
+
"""
|
|
207
|
+
raw_l = raw.lower().strip()
|
|
208
|
+
if raw_l in _TYPE_ALIASES:
|
|
209
|
+
return _TYPE_ALIASES[raw_l]
|
|
210
|
+
# Preserve parametric types: "varchar(255)" → "string(255)"
|
|
211
|
+
if "(" in raw_l:
|
|
212
|
+
head, tail = raw_l.split("(", 1)
|
|
213
|
+
base = _TYPE_ALIASES.get(head.strip(), head.strip())
|
|
214
|
+
return f"{base}({tail}"
|
|
215
|
+
return raw_l
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Dialect plugin registry.
|
|
2
|
+
|
|
3
|
+
Each SQL/NoSQL target engine ships as a module under this package implementing the
|
|
4
|
+
DialectPlugin protocol in `base.py`. The registry in `registry.py` is the single
|
|
5
|
+
entry point for code that wants to emit DDL or type-map without knowing which
|
|
6
|
+
dialect is in play.
|
|
7
|
+
|
|
8
|
+
Ports in Phase A: postgres, snowflake. The legacy monolithic `generators.py`
|
|
9
|
+
remains available as a fallback and continues to serve bigquery/databricks/mysql/
|
|
10
|
+
sqlserver until those are ported (Phase A task 5).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from datalex_core.dialects import base, registry, postgres, snowflake # noqa: F401
|
|
14
|
+
|
|
15
|
+
__all__ = ["base", "registry"]
|