datalex-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datalex_cli/__init__.py +1 -0
- datalex_cli/datalex_cli.py +658 -0
- datalex_cli/main.py +2925 -0
- datalex_cli-0.1.1.dist-info/METADATA +228 -0
- datalex_cli-0.1.1.dist-info/RECORD +64 -0
- datalex_cli-0.1.1.dist-info/WHEEL +5 -0
- datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
- datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
- datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
- datalex_core/__init__.py +94 -0
- datalex_core/_schemas/datalex/common.schema.json +127 -0
- datalex_core/_schemas/datalex/domain.schema.json +24 -0
- datalex_core/_schemas/datalex/entity.schema.json +158 -0
- datalex_core/_schemas/datalex/model.schema.json +141 -0
- datalex_core/_schemas/datalex/policy.schema.json +70 -0
- datalex_core/_schemas/datalex/project.schema.json +82 -0
- datalex_core/_schemas/datalex/snippet.schema.json +24 -0
- datalex_core/_schemas/datalex/source.schema.json +104 -0
- datalex_core/_schemas/datalex/term.schema.json +30 -0
- datalex_core/canonical.py +166 -0
- datalex_core/completion.py +204 -0
- datalex_core/connectors/__init__.py +39 -0
- datalex_core/connectors/base.py +417 -0
- datalex_core/connectors/bigquery.py +229 -0
- datalex_core/connectors/databricks.py +262 -0
- datalex_core/connectors/mysql.py +266 -0
- datalex_core/connectors/postgres.py +309 -0
- datalex_core/connectors/redshift.py +298 -0
- datalex_core/connectors/snowflake.py +336 -0
- datalex_core/connectors/sqlserver.py +425 -0
- datalex_core/datalex/__init__.py +26 -0
- datalex_core/datalex/diff.py +188 -0
- datalex_core/datalex/errors.py +85 -0
- datalex_core/datalex/loader.py +512 -0
- datalex_core/datalex/migrate_layout.py +382 -0
- datalex_core/datalex/parse_cache.py +102 -0
- datalex_core/datalex/project.py +214 -0
- datalex_core/datalex/types.py +224 -0
- datalex_core/dbt/__init__.py +18 -0
- datalex_core/dbt/emit.py +344 -0
- datalex_core/dbt/manifest.py +329 -0
- datalex_core/dbt/profiles.py +185 -0
- datalex_core/dbt/sync.py +279 -0
- datalex_core/dbt/warehouse.py +215 -0
- datalex_core/dialects/__init__.py +15 -0
- datalex_core/dialects/_common.py +48 -0
- datalex_core/dialects/base.py +47 -0
- datalex_core/dialects/postgres.py +164 -0
- datalex_core/dialects/registry.py +36 -0
- datalex_core/dialects/snowflake.py +129 -0
- datalex_core/diffing.py +358 -0
- datalex_core/docs_generator.py +797 -0
- datalex_core/doctor.py +181 -0
- datalex_core/generators.py +478 -0
- datalex_core/importers.py +1176 -0
- datalex_core/issues.py +23 -0
- datalex_core/loader.py +21 -0
- datalex_core/migrate.py +316 -0
- datalex_core/modeling.py +679 -0
- datalex_core/packages.py +430 -0
- datalex_core/policy.py +1037 -0
- datalex_core/resolver.py +456 -0
- datalex_core/schema.py +54 -0
- datalex_core/semantic.py +1561 -0
datalex_core/dbt/emit.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
"""dbt YAML emitter.
|
|
2
|
+
|
|
3
|
+
Given a loaded DataLexProject, emits:
|
|
4
|
+
* sources/<source_name>.yml — one file per `kind: source`
|
|
5
|
+
* models/_schema.yml — schema.yml for every `kind: model`
|
|
6
|
+
|
|
7
|
+
Output is dbt v2 format and includes:
|
|
8
|
+
* contracts (`config.contract.enforced: true` with `data_type` per column)
|
|
9
|
+
* column-level constraints (primary_key / unique / not_null / foreign_key / check)
|
|
10
|
+
* tests (unique / not_null / accepted_values / relationships / custom)
|
|
11
|
+
* freshness (at source level and per-table)
|
|
12
|
+
* meta round-trip via `meta.datalex.*` so reimports never clobber user intent
|
|
13
|
+
|
|
14
|
+
The dict payloads returned by `build_sources_yaml` / `build_models_yaml` are plain,
|
|
15
|
+
serialization-ready dicts — callers choose how to write them (single file, per-file,
|
|
16
|
+
etc.) via `write_dbt_yaml`.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
24
|
+
|
|
25
|
+
import yaml
|
|
26
|
+
|
|
27
|
+
from datalex_core.datalex.project import DataLexProject
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ------------------------ build payloads ------------------------
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def build_sources_yaml(project: DataLexProject) -> Dict[str, Dict[str, Any]]:
    """Build one dbt sources document per entry in ``project.sources``.

    Returns a mapping of relative path (``sources/<name>.yml``) to a dbt
    payload of the form ``{"version": 2, "sources": [<source dict>]}``.
    Each source gets its own file, named after the source's ``name``.
    """
    return {
        f"sources/{src['name']}.yml": {"version": 2, "sources": [_source_to_dict(src)]}
        for src in project.sources.values()
    }
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def build_models_yaml(project: DataLexProject) -> Dict[str, Dict[str, Any]]:
|
|
47
|
+
"""Return { relative_path: models_doc } for every `kind: model` file.
|
|
48
|
+
|
|
49
|
+
We collect all models into a single `models/_schema.yml` so dbt can `dbt parse`
|
|
50
|
+
them in one read. Power users can split later; keeping everything in one file
|
|
51
|
+
is the dbt community's default and prevents discovery surprises.
|
|
52
|
+
"""
|
|
53
|
+
models = [_model_to_dict(m) for m in project.models.values()]
|
|
54
|
+
if not models:
|
|
55
|
+
return {}
|
|
56
|
+
return {"models/_schema.yml": {"version": 2, "models": models}}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ------------------------ writing ------------------------
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class EmitReport:
    """Summary of what a dbt emission run wrote."""

    # Paths of every file written, in the order they were written.
    files: List[str] = field(default_factory=list)
    # Number of source files written.
    sources: int = 0
    # Number of model files written.
    models: int = 0

    def summary(self) -> str:
        """Return a multi-line, human-readable report of the run."""
        header = [
            "dbt emission complete:",
            f" source files: {self.sources}",
            f" model files: {self.models}",
        ]
        listing = [f" - {path}" for path in self.files]
        return "\n".join(header + listing)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def emit_dbt(
    project: DataLexProject,
    out_dir: str,
    include_sources: bool = True,
    include_models: bool = True,
) -> EmitReport:
    """Write the project's dbt YAML files under ``out_dir``.

    Args:
        project: The loaded project to render.
        out_dir: Root directory for the output; created if missing.
        include_sources: When true, write the files from ``build_sources_yaml``.
        include_models: When true, write the files from ``build_models_yaml``.

    Returns:
        An ``EmitReport`` listing each written path and the per-kind file counts.
    """
    report = EmitReport()
    root = Path(out_dir)
    root.mkdir(parents=True, exist_ok=True)

    # (report counter attribute, payload builder) for each enabled section,
    # sources first. Builders run lazily, just before their files are written.
    sections = []
    if include_sources:
        sections.append(("sources", build_sources_yaml))
    if include_models:
        sections.append(("models", build_models_yaml))

    for counter, builder in sections:
        for rel, payload in builder(project).items():
            target = root / rel
            target.parent.mkdir(parents=True, exist_ok=True)
            _write_yaml(target, payload)
            report.files.append(str(target))
            setattr(report, counter, getattr(report, counter) + 1)

    return report
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _write_yaml(path: Path, doc: Dict[str, Any]) -> None:
    """Serialize ``doc`` to ``path`` as UTF-8 block-style YAML.

    Key order is kept as given (no sorting), unicode is written unescaped,
    and lines wrap at 120 columns. The parent directory must already exist.
    """
    dump_options = {
        "sort_keys": False,
        "default_flow_style": False,
        "allow_unicode": True,
        "width": 120,
    }
    with path.open("w", encoding="utf-8") as handle:
        yaml.safe_dump(doc, handle, **dump_options)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ------------------------ translators ------------------------
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _source_to_dict(src: Dict[str, Any]) -> Dict[str, Any]:
    """Translate one DataLex source document into a dbt source entry.

    Copies the simple scalar attributes that are set, translates freshness,
    renders every table, and attaches the merged ``meta`` block (tagged with
    ``kind: source`` under ``meta.datalex``). ``tables`` is always present in
    the result, even when empty.
    """
    rendered: Dict[str, Any] = {"name": src["name"]}
    _copy_if_set(rendered, src, ("description", "database", "schema", "loader", "loaded_at_field"))

    freshness = src.get("freshness")
    if freshness:
        rendered["freshness"] = _translate_freshness(freshness)

    rendered["tables"] = [_source_table_to_dict(table) for table in (src.get("tables") or [])]

    merged_meta = _build_meta(src, extra={"kind": "source"})
    if merged_meta:
        rendered["meta"] = merged_meta

    return rendered
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _source_table_to_dict(tbl: Dict[str, Any]) -> Dict[str, Any]:
    """Translate one source table into its dbt representation.

    ``columns`` and ``meta`` are emitted only when they are non-empty.
    """
    rendered: Dict[str, Any] = {"name": tbl["name"]}
    _copy_if_set(rendered, tbl, ("description", "identifier", "loaded_at_field"))

    freshness = tbl.get("freshness")
    if freshness:
        rendered["freshness"] = _translate_freshness(freshness)

    columns = [_source_column_to_dict(column) for column in (tbl.get("columns") or [])]
    if columns:
        rendered["columns"] = columns

    merged_meta = _build_meta(tbl)
    if merged_meta:
        rendered["meta"] = merged_meta
    return rendered
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _source_column_to_dict(col: Dict[str, Any]) -> Dict[str, Any]:
    """Translate one source column into its dbt representation.

    The DataLex ``type`` is emitted as dbt's ``data_type``; tests are copied
    into a new list; sensitivity, tags and terms travel via ``meta.datalex``.
    """
    rendered: Dict[str, Any] = {"name": col["name"]}
    _copy_if_set(rendered, col, ("description",))

    # Emitted as data_type so the type is available to contracts downstream.
    column_type = col.get("type")
    if column_type:
        rendered["data_type"] = column_type

    tests = col.get("tests")
    if tests:
        rendered["tests"] = list(tests)

    merged_meta = _build_meta(col, extra=_sensitivity_meta(col))
    if merged_meta:
        rendered["meta"] = merged_meta
    return rendered
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _model_to_dict(m: Dict[str, Any]) -> Dict[str, Any]:
    """Translate one DataLex model document into a dbt model entry.

    Output keys are added in this order: ``name``, ``description``,
    ``config``, ``columns``, ``meta``. Model tags are stored under
    ``config.tags``; if no config existed yet, ``config`` is created at that
    point and therefore follows ``meta``.
    """
    rendered: Dict[str, Any] = {"name": m["name"]}
    _copy_if_set(rendered, m, ("description",))

    model_config = _model_config(m)
    if model_config:
        rendered["config"] = model_config

    # Columns need to know whether the contract is enforced so they can flag
    # a missing data type.
    enforced = bool((m.get("contract") or {}).get("enforced"))
    columns = [
        _model_column_to_dict(column, contract_enforced=enforced)
        for column in (m.get("columns") or [])
    ]
    if columns:
        rendered["columns"] = columns

    # Round-trip keys recorded under meta.datalex; only set values are kept.
    round_trip: Dict[str, Any] = {"kind": "model"}
    if m.get("depends_on"):
        round_trip["depends_on"] = [_ref_to_string(dep) for dep in m["depends_on"]]
    for key in ("derived_sql", "sql_path", "owner", "domain"):
        if m.get(key):
            round_trip[key] = m[key]

    merged_meta = _build_meta(m, extra=round_trip)
    if merged_meta:
        rendered["meta"] = merged_meta

    tags = m.get("tags")
    if tags:
        rendered.setdefault("config", {})["tags"] = list(tags)

    return rendered
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _model_config(m: Dict[str, Any]) -> Dict[str, Any]:
|
|
210
|
+
cfg: Dict[str, Any] = {}
|
|
211
|
+
if m.get("materialization"):
|
|
212
|
+
cfg["materialized"] = m["materialization"]
|
|
213
|
+
if m.get("database"):
|
|
214
|
+
cfg["database"] = m["database"]
|
|
215
|
+
if m.get("schema"):
|
|
216
|
+
cfg["schema"] = m["schema"]
|
|
217
|
+
contract = m.get("contract") or {}
|
|
218
|
+
if contract.get("enforced") is not None:
|
|
219
|
+
cfg["contract"] = {"enforced": bool(contract["enforced"])}
|
|
220
|
+
return cfg
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _model_column_to_dict(col: Dict[str, Any], contract_enforced: bool) -> Dict[str, Any]:
    """Translate one model column into its dbt representation.

    Args:
        col: The DataLex column definition.
        contract_enforced: Whether the owning model enforces its contract.

    Returns:
        The dbt column dict with ``data_type``, ``constraints``, ``tests`` and
        ``meta`` included only when they have content.
    """
    rendered: Dict[str, Any] = {"name": col["name"]}
    _copy_if_set(rendered, col, ("description",))

    declared_type = col.get("type")
    if declared_type:
        rendered["data_type"] = declared_type
    elif contract_enforced:
        # A contract-enforced model needs data_type on every column; emit a
        # visible placeholder instead of silently omitting the key.
        rendered["data_type"] = "UNSPECIFIED"

    constraints = _translate_constraints(col)
    if constraints:
        rendered["constraints"] = constraints

    tests = col.get("tests")
    if tests:
        rendered["tests"] = list(tests)

    merged_meta = _build_meta(col, extra=_sensitivity_meta(col))
    if merged_meta:
        rendered["meta"] = merged_meta
    return rendered
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _translate_constraints(col: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
249
|
+
"""Convert DataLex column constraint rules to dbt constraint entries.
|
|
250
|
+
|
|
251
|
+
Pulls from both shorthand fields (primary_key / unique / nullable) and the
|
|
252
|
+
explicit `constraints:` array. Deduplicates by (type, expression) so the
|
|
253
|
+
same intent declared both ways doesn't produce duplicate entries.
|
|
254
|
+
"""
|
|
255
|
+
out: List[Dict[str, Any]] = []
|
|
256
|
+
seen: set = set()
|
|
257
|
+
|
|
258
|
+
def _add(entry: Dict[str, Any]) -> None:
|
|
259
|
+
key = (entry.get("type"), entry.get("expression"))
|
|
260
|
+
if key not in seen:
|
|
261
|
+
seen.add(key)
|
|
262
|
+
out.append(entry)
|
|
263
|
+
|
|
264
|
+
if col.get("primary_key"):
|
|
265
|
+
_add({"type": "primary_key"})
|
|
266
|
+
if col.get("unique"):
|
|
267
|
+
_add({"type": "unique"})
|
|
268
|
+
if col.get("nullable") is False and not col.get("primary_key"):
|
|
269
|
+
_add({"type": "not_null"})
|
|
270
|
+
|
|
271
|
+
ref = col.get("references")
|
|
272
|
+
if ref and ref.get("entity") and ref.get("column"):
|
|
273
|
+
_add({"type": "foreign_key", "expression": f"{ref['entity']}({ref['column']})"})
|
|
274
|
+
|
|
275
|
+
for c in col.get("constraints", []) or []:
|
|
276
|
+
ctype = c.get("type")
|
|
277
|
+
if ctype in ("primary_key", "unique", "not_null"):
|
|
278
|
+
_add({"type": ctype})
|
|
279
|
+
elif ctype == "check" and c.get("expression"):
|
|
280
|
+
_add({"type": "check", "expression": c["expression"]})
|
|
281
|
+
elif ctype == "foreign_key" and c.get("expression"):
|
|
282
|
+
_add({"type": "foreign_key", "expression": c["expression"]})
|
|
283
|
+
return out
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _translate_freshness(f: Dict[str, Any]) -> Dict[str, Any]:
|
|
287
|
+
out: Dict[str, Any] = {}
|
|
288
|
+
for k in ("warn_after", "error_after"):
|
|
289
|
+
if f.get(k):
|
|
290
|
+
out[k] = {"count": f[k]["count"], "period": f[k]["period"]}
|
|
291
|
+
if f.get("filter"):
|
|
292
|
+
out["filter"] = f["filter"]
|
|
293
|
+
return out
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _build_meta(
|
|
297
|
+
obj: Dict[str, Any],
|
|
298
|
+
extra: Optional[Dict[str, Any]] = None,
|
|
299
|
+
) -> Dict[str, Any]:
|
|
300
|
+
"""Merge the object's declared `meta` with governance round-trip keys under
|
|
301
|
+
`meta.datalex.*`. User-declared keys win — we never overwrite."""
|
|
302
|
+
out: Dict[str, Any] = {}
|
|
303
|
+
# start with any user-declared meta
|
|
304
|
+
user_meta = obj.get("meta") or {}
|
|
305
|
+
if isinstance(user_meta, dict):
|
|
306
|
+
out.update({k: v for k, v in user_meta.items() if k != "datalex"})
|
|
307
|
+
|
|
308
|
+
datalex_meta: Dict[str, Any] = {}
|
|
309
|
+
# Preserve existing meta.datalex if present (idempotent re-emits).
|
|
310
|
+
if isinstance(user_meta.get("datalex"), dict):
|
|
311
|
+
datalex_meta.update(user_meta["datalex"])
|
|
312
|
+
if extra:
|
|
313
|
+
for k, v in extra.items():
|
|
314
|
+
datalex_meta.setdefault(k, v)
|
|
315
|
+
if datalex_meta:
|
|
316
|
+
out["datalex"] = datalex_meta
|
|
317
|
+
return out
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _sensitivity_meta(obj: Dict[str, Any]) -> Dict[str, Any]:
|
|
321
|
+
extra: Dict[str, Any] = {}
|
|
322
|
+
if obj.get("sensitivity"):
|
|
323
|
+
extra["sensitivity"] = obj["sensitivity"]
|
|
324
|
+
if obj.get("tags"):
|
|
325
|
+
extra["tags"] = list(obj["tags"])
|
|
326
|
+
if obj.get("terms"):
|
|
327
|
+
extra["terms"] = list(obj["terms"])
|
|
328
|
+
return extra
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _copy_if_set(dst: Dict[str, Any], src: Dict[str, Any], keys: Tuple[str, ...]) -> None:
|
|
332
|
+
for k in keys:
|
|
333
|
+
v = src.get(k)
|
|
334
|
+
if v is not None and v != "":
|
|
335
|
+
dst[k] = v
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _ref_to_string(dep: Dict[str, Any]) -> str:
|
|
339
|
+
if "ref" in dep:
|
|
340
|
+
return f"ref:{dep['ref']}"
|
|
341
|
+
if "source" in dep:
|
|
342
|
+
s = dep["source"]
|
|
343
|
+
return f"source:{s.get('source')}.{s.get('name')}"
|
|
344
|
+
return str(dep)
|
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
"""dbt manifest.json -> DataLex source/model importer with idempotent round-trip.
|
|
2
|
+
|
|
3
|
+
Design:
|
|
4
|
+
* Stable key = `unique_id` from the manifest (e.g. `source.my_project.raw.orders`,
|
|
5
|
+
`model.my_project.stg_orders`). Stored under `meta.datalex.dbt.unique_id`.
|
|
6
|
+
* On re-import, existing DataLex files are *merged*, not overwritten. User-authored
|
|
7
|
+
fields (description, tests, sensitivity, owner, etc.) are preserved; only fields
|
|
8
|
+
the manifest owns (database/schema/columns' data_type) get refreshed.
|
|
9
|
+
* The importer emits ready-to-write dicts; callers choose where to write them
|
|
10
|
+
(typically under `sources/` and `models/`).
|
|
11
|
+
|
|
12
|
+
What we do NOT do here: write files. A thin wrapper does that — `write_import_result`
|
|
13
|
+
in this module — but users can choose to merge into an existing project tree manually
|
|
14
|
+
via their own logic.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any, Dict, List, Optional
|
|
23
|
+
|
|
24
|
+
import yaml
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ------------------------ public API ------------------------
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class ImportResult:
    """Documents produced by a manifest import, ready to be written out."""

    # Source documents; `import_manifest` keys them by each document's "name".
    sources: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    # Model documents, keyed the same way.
    models: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    # Human-readable warning messages.
    warnings: List[str] = field(default_factory=list)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def import_manifest(
    manifest_path: str,
    existing_project_root: Optional[str] = None,
) -> ImportResult:
    """Parse a dbt manifest.json and return merged DataLex source/model docs.

    Args:
        manifest_path: Path to the manifest JSON file.
        existing_project_root: Optional root of an existing project tree.
            When given, documents indexed by ``_load_existing_by_unique_id``
            are used as the prior state while building each document.

    Returns:
        An ``ImportResult`` whose ``sources`` and ``models`` are keyed by the
        coerced document name. Two manifest entries that resolve to the same
        name cannot both be kept: the later one replaces the earlier one, and
        a message describing the collision is appended to ``warnings``.

    Raises:
        OSError: If the manifest cannot be opened.
        json.JSONDecodeError: If the manifest is not valid JSON.
    """
    with open(manifest_path, "r", encoding="utf-8") as f:
        manifest = json.load(f)

    existing = _load_existing_by_unique_id(existing_project_root) if existing_project_root else {}

    result = ImportResult()

    # The manifest lists one node per source *table*; group them by source
    # name so that one document is produced per source.
    sources_grouped: Dict[str, List[Dict[str, Any]]] = {}
    for node in (manifest.get("sources") or {}).values():
        source_name = node.get("source_name") or node.get("name")
        sources_grouped.setdefault(source_name, []).append(node)

    for source_name, tables in sources_grouped.items():
        doc = _build_source_doc(source_name, tables, existing)
        if doc["name"] in result.sources:
            # Distinct manifest names can coerce to the same document name.
            result.warnings.append(
                f"source name collision: {source_name!r} maps to {doc['name']!r}, "
                "which was already imported; the earlier document was replaced"
            )
        result.sources[doc["name"]] = doc

    for node in (manifest.get("nodes") or {}).values():
        if node.get("resource_type") != "model":
            continue
        doc = _build_model_doc(node, existing)
        if doc["name"] in result.models:
            # For example several nodes sharing one model name.
            result.warnings.append(
                f"model name collision: {node.get('unique_id')!r} maps to {doc['name']!r}, "
                "which was already imported; the earlier document was replaced"
            )
        result.models[doc["name"]] = doc

    return result
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def write_import_result(result: ImportResult, out_root: str) -> List[str]:
    """Write every document of ``result`` beneath ``out_root``.

    Layout:
        <out_root>/sources/<name>.yaml
        <out_root>/models/dbt/<name>.yaml

    Returns the written paths as strings, sources first, then models.
    """
    root = Path(out_root)
    layout = (
        (("sources",), result.sources),
        (("models", "dbt"), result.models),
    )

    written: List[str] = []
    for folder_parts, docs in layout:
        for doc in docs.values():
            path = root.joinpath(*folder_parts, f"{doc['name']}.yaml")
            _write_yaml(path, doc)
            written.append(str(path))
    return written
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# ------------------------ builders ------------------------
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _build_source_doc(
    source_name: str,
    tables: List[Dict[str, Any]],
    existing: Dict[str, Dict[str, Any]],
) -> Dict[str, Any]:
    """Assemble the DataLex source document for one group of manifest tables.

    Args:
        source_name: Name of the source (coerced via ``_safe_name``).
        tables: Manifest source nodes belonging to this source; must be
            non-empty because source-level attributes come from the first.
        existing: Index of previously written documents by dbt unique id.

    Returns:
        The source document, including ``tables`` and the sorted list of the
        tables' unique ids under ``meta.datalex.dbt.unique_ids``.
    """
    # database/schema are stored per table node; the first one is used.
    head = tables[0]
    database = head.get("database")
    schema = head.get("schema")

    # A prior document is found via any of the tables' unique ids; when
    # several match, the one matched by the last table is used.
    matches = [existing.get(table.get("unique_id")) for table in tables]
    existing_doc: Optional[Dict[str, Any]] = next(
        (match for match in reversed(matches) if match), None
    )

    doc: Dict[str, Any] = {"kind": "source", "name": _safe_name(source_name)}
    if existing_doc:
        _merge_preserving_user_fields(doc, existing_doc, keys=("description", "owner", "tags", "loader", "loaded_at_field", "freshness"))

    if database:
        doc["database"] = database
    if schema:
        doc["schema"] = schema

    doc["tables"] = [_build_source_table_doc(table, existing_doc) for table in tables]

    # Record every table's unique id so a later import can locate this doc.
    known_ids = (table.get("unique_id") for table in tables)
    datalex_meta = doc.setdefault("meta", {}).setdefault("datalex", {})
    datalex_meta["dbt"] = {"unique_ids": sorted(uid for uid in known_ids if uid)}
    return doc
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _build_source_table_doc(
    t: Dict[str, Any],
    existing_source: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
    """Build the DataLex table entry for one manifest source node.

    Args:
        t: The manifest source node.
        existing_source: The previously written source document, if any; a
            table in it with the same (coerced) name supplies prior values.

    Returns:
        The table dict. The manifest description is used when present,
        otherwise the prior one. ``identifier`` is kept only when it differs
        from the name, and the node's unique id is stored under
        ``meta.datalex.dbt.unique_id``.
    """
    name = _safe_name(t.get("name", ""))
    table_doc: Dict[str, Any] = {"name": name}

    # First table of the prior source document with a matching name.
    prior_tables = (existing_source.get("tables") or []) if existing_source else []
    prior_table: Dict[str, Any] = next(
        (candidate for candidate in prior_tables if candidate.get("name") == name), {}
    )

    description = t.get("description") or prior_table.get("description")
    if description:
        table_doc["description"] = description

    identifier = t.get("identifier")
    if identifier and identifier != name:
        table_doc["identifier"] = identifier
    for passthrough in ("loaded_at_field", "freshness"):
        if t.get(passthrough):
            table_doc[passthrough] = t[passthrough]

    # Columns may arrive as a name-keyed mapping or as a list.
    prior_cols = {c.get("name"): c for c in (prior_table.get("columns") or []) if c.get("name")}
    raw_columns = t.get("columns")
    column_iter = raw_columns.values() if isinstance(raw_columns, dict) else (raw_columns or [])
    cols_out = [
        _build_source_column_doc(column, prior_cols.get(column.get("name"), {}))
        for column in column_iter
    ]
    if cols_out:
        table_doc["columns"] = cols_out

    unique_id = t.get("unique_id")
    if unique_id:
        dbt_meta = table_doc.setdefault("meta", {}).setdefault("datalex", {}).setdefault("dbt", {})
        dbt_meta["unique_id"] = unique_id

    return table_doc
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _build_source_column_doc(c: Dict[str, Any], prior: Dict[str, Any]) -> Dict[str, Any]:
|
|
187
|
+
doc: Dict[str, Any] = {"name": c.get("name")}
|
|
188
|
+
# type: manifest owns it; prefer manifest value if present
|
|
189
|
+
if c.get("data_type"):
|
|
190
|
+
doc["type"] = c["data_type"]
|
|
191
|
+
elif prior.get("type"):
|
|
192
|
+
doc["type"] = prior["type"]
|
|
193
|
+
|
|
194
|
+
# user-authored: preserve
|
|
195
|
+
for k in ("description", "sensitivity", "tags"):
|
|
196
|
+
if prior.get(k):
|
|
197
|
+
doc[k] = prior[k]
|
|
198
|
+
# manifest description wins only if user has no override
|
|
199
|
+
if c.get("description") and "description" not in doc:
|
|
200
|
+
doc["description"] = c["description"]
|
|
201
|
+
|
|
202
|
+
return doc
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _build_model_doc(
    node: Dict[str, Any],
    existing: Dict[str, Dict[str, Any]],
) -> Dict[str, Any]:
    """Build the DataLex model document for one manifest model node.

    Args:
        node: The manifest node (expected to be a model).
        existing: Index of previously written documents by dbt unique id; the
            entry matching this node's ``unique_id`` supplies prior values.

    Returns:
        The model document. Prior values of description, owner, domain, tags,
        materialization and contract are kept; database and schema are always
        taken from the manifest; the manifest's materialization and
        description are used only when no prior value exists.
    """
    name = _safe_name(node.get("name", ""))
    uid = node.get("unique_id")
    prior = existing.get(uid, {}) if uid else {}

    doc: Dict[str, Any] = {
        "kind": "model",
        "name": name,
    }
    # Values carried over from the prior document.
    _merge_preserving_user_fields(
        doc, prior, keys=("description", "owner", "domain", "tags", "materialization", "contract"),
    )

    # Values supplied by the manifest.
    config = node.get("config") or {}
    if config.get("materialized") and "materialization" not in doc:
        doc["materialization"] = config["materialized"]
    if node.get("database"):
        doc["database"] = node["database"]
    if node.get("schema"):
        doc["schema"] = node["schema"]
    if node.get("description") and "description" not in doc:
        doc["description"] = node["description"]

    # depends_on: model parents become refs, source parents become source pointers.
    depends: List[Dict[str, Any]] = []
    for parent_uid in (node.get("depends_on", {}) or {}).get("nodes", []) or []:
        segments = parent_uid.split(".")
        if parent_uid.startswith("model."):
            # Ids look like model.<package>.<name>, with an extra .v<version>
            # segment for versioned models, so the name is the third segment;
            # taking the last segment would yield the version suffix instead.
            model_name = segments[2] if len(segments) > 2 else segments[-1]
            depends.append({"ref": _safe_name(model_name)})
        elif parent_uid.startswith("source."):
            # source.<package>.<source_name>.<table_name>
            if len(segments) >= 4:
                depends.append({"source": {"source": _safe_name(segments[-2]), "name": _safe_name(segments[-1])}})
    if depends:
        doc["depends_on"] = depends

    # Columns may arrive as a name-keyed mapping or as a list.
    prior_cols = {c.get("name"): c for c in (prior.get("columns") or []) if c.get("name")}
    columns_raw = node.get("columns") or {}
    column_iter = columns_raw.values() if isinstance(columns_raw, dict) else columns_raw
    cols_out: List[Dict[str, Any]] = [
        _build_model_column_doc(c, prior_cols.get(c.get("name"), {})) for c in column_iter
    ]
    if cols_out:
        doc["columns"] = cols_out

    if uid:
        # Stored so that a later import can match this document again.
        doc.setdefault("meta", {}).setdefault("datalex", {})["dbt"] = {"unique_id": uid}

    return doc
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _build_model_column_doc(c: Dict[str, Any], prior: Dict[str, Any]) -> Dict[str, Any]:
|
|
262
|
+
doc: Dict[str, Any] = {"name": c.get("name")}
|
|
263
|
+
if c.get("data_type"):
|
|
264
|
+
doc["type"] = c["data_type"]
|
|
265
|
+
elif prior.get("type"):
|
|
266
|
+
doc["type"] = prior["type"]
|
|
267
|
+
|
|
268
|
+
for k in ("description", "sensitivity", "tags", "terms", "tests", "constraints"):
|
|
269
|
+
if prior.get(k):
|
|
270
|
+
doc[k] = prior[k]
|
|
271
|
+
if c.get("description") and "description" not in doc:
|
|
272
|
+
doc["description"] = c["description"]
|
|
273
|
+
return doc
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
# ------------------------ helpers ------------------------
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def _load_existing_by_unique_id(project_root: str) -> Dict[str, Dict[str, Any]]:
|
|
280
|
+
"""Walk the project tree and index every doc by its `meta.datalex.dbt.unique_id(s)`."""
|
|
281
|
+
out: Dict[str, Dict[str, Any]] = {}
|
|
282
|
+
root = Path(project_root)
|
|
283
|
+
if not root.exists():
|
|
284
|
+
return out
|
|
285
|
+
for path in root.rglob("*.yaml"):
|
|
286
|
+
try:
|
|
287
|
+
with path.open("r", encoding="utf-8") as f:
|
|
288
|
+
doc = yaml.safe_load(f)
|
|
289
|
+
except Exception:
|
|
290
|
+
continue
|
|
291
|
+
if not isinstance(doc, dict):
|
|
292
|
+
continue
|
|
293
|
+
meta = (doc.get("meta") or {}).get("datalex") or {}
|
|
294
|
+
dbt_meta = meta.get("dbt") or {}
|
|
295
|
+
uid = dbt_meta.get("unique_id")
|
|
296
|
+
uids = dbt_meta.get("unique_ids") or ([uid] if uid else [])
|
|
297
|
+
for u in uids:
|
|
298
|
+
if u:
|
|
299
|
+
out[u] = doc
|
|
300
|
+
return out
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _merge_preserving_user_fields(
|
|
304
|
+
dst: Dict[str, Any],
|
|
305
|
+
src: Dict[str, Any],
|
|
306
|
+
keys: tuple,
|
|
307
|
+
) -> None:
|
|
308
|
+
for k in keys:
|
|
309
|
+
if src.get(k) not in (None, "", [], {}):
|
|
310
|
+
dst[k] = src[k]
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _safe_name(name: str) -> str:
|
|
314
|
+
"""DataLex names must match ^[a-z][a-z0-9_]*$ — coerce dbt names that drift."""
|
|
315
|
+
import re
|
|
316
|
+
|
|
317
|
+
if not name:
|
|
318
|
+
return "unnamed"
|
|
319
|
+
s = name.strip().lower()
|
|
320
|
+
s = re.sub(r"[^a-z0-9_]", "_", s)
|
|
321
|
+
if not re.match(r"^[a-z]", s):
|
|
322
|
+
s = "n_" + s
|
|
323
|
+
return s
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _write_yaml(path: Path, doc: Dict[str, Any]) -> None:
    """Write ``doc`` to ``path`` as UTF-8 block-style YAML, creating parent dirs.

    Key order is kept as given (no sorting) and unicode is written unescaped.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    dump_options = {"sort_keys": False, "default_flow_style": False, "allow_unicode": True}
    with path.open("w", encoding="utf-8") as handle:
        yaml.safe_dump(doc, handle, **dump_options)
|