datalex-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. datalex_cli/__init__.py +1 -0
  2. datalex_cli/datalex_cli.py +658 -0
  3. datalex_cli/main.py +2925 -0
  4. datalex_cli-0.1.1.dist-info/METADATA +228 -0
  5. datalex_cli-0.1.1.dist-info/RECORD +64 -0
  6. datalex_cli-0.1.1.dist-info/WHEEL +5 -0
  7. datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
  8. datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
  9. datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
  10. datalex_core/__init__.py +94 -0
  11. datalex_core/_schemas/datalex/common.schema.json +127 -0
  12. datalex_core/_schemas/datalex/domain.schema.json +24 -0
  13. datalex_core/_schemas/datalex/entity.schema.json +158 -0
  14. datalex_core/_schemas/datalex/model.schema.json +141 -0
  15. datalex_core/_schemas/datalex/policy.schema.json +70 -0
  16. datalex_core/_schemas/datalex/project.schema.json +82 -0
  17. datalex_core/_schemas/datalex/snippet.schema.json +24 -0
  18. datalex_core/_schemas/datalex/source.schema.json +104 -0
  19. datalex_core/_schemas/datalex/term.schema.json +30 -0
  20. datalex_core/canonical.py +166 -0
  21. datalex_core/completion.py +204 -0
  22. datalex_core/connectors/__init__.py +39 -0
  23. datalex_core/connectors/base.py +417 -0
  24. datalex_core/connectors/bigquery.py +229 -0
  25. datalex_core/connectors/databricks.py +262 -0
  26. datalex_core/connectors/mysql.py +266 -0
  27. datalex_core/connectors/postgres.py +309 -0
  28. datalex_core/connectors/redshift.py +298 -0
  29. datalex_core/connectors/snowflake.py +336 -0
  30. datalex_core/connectors/sqlserver.py +425 -0
  31. datalex_core/datalex/__init__.py +26 -0
  32. datalex_core/datalex/diff.py +188 -0
  33. datalex_core/datalex/errors.py +85 -0
  34. datalex_core/datalex/loader.py +512 -0
  35. datalex_core/datalex/migrate_layout.py +382 -0
  36. datalex_core/datalex/parse_cache.py +102 -0
  37. datalex_core/datalex/project.py +214 -0
  38. datalex_core/datalex/types.py +224 -0
  39. datalex_core/dbt/__init__.py +18 -0
  40. datalex_core/dbt/emit.py +344 -0
  41. datalex_core/dbt/manifest.py +329 -0
  42. datalex_core/dbt/profiles.py +185 -0
  43. datalex_core/dbt/sync.py +279 -0
  44. datalex_core/dbt/warehouse.py +215 -0
  45. datalex_core/dialects/__init__.py +15 -0
  46. datalex_core/dialects/_common.py +48 -0
  47. datalex_core/dialects/base.py +47 -0
  48. datalex_core/dialects/postgres.py +164 -0
  49. datalex_core/dialects/registry.py +36 -0
  50. datalex_core/dialects/snowflake.py +129 -0
  51. datalex_core/diffing.py +358 -0
  52. datalex_core/docs_generator.py +797 -0
  53. datalex_core/doctor.py +181 -0
  54. datalex_core/generators.py +478 -0
  55. datalex_core/importers.py +1176 -0
  56. datalex_core/issues.py +23 -0
  57. datalex_core/loader.py +21 -0
  58. datalex_core/migrate.py +316 -0
  59. datalex_core/modeling.py +679 -0
  60. datalex_core/packages.py +430 -0
  61. datalex_core/policy.py +1037 -0
  62. datalex_core/resolver.py +456 -0
  63. datalex_core/schema.py +54 -0
  64. datalex_core/semantic.py +1561 -0
@@ -0,0 +1,344 @@
1
+ """dbt YAML emitter.
2
+
3
+ Given a loaded DataLexProject, emits:
4
+ * sources/<source_name>.yml — one file per `kind: source`
5
+ * models/_schema.yml — schema.yml for every `kind: model`
6
+
7
+ Output is dbt v2 format and includes:
8
+ * contracts (`config.contract.enforced: true` with `data_type` per column)
9
+ * column-level constraints (primary_key / unique / not_null / foreign_key / check)
10
+ * tests (unique / not_null / accepted_values / relationships / custom)
11
+ * freshness (at source level and per-table)
12
+ * meta round-trip via `meta.datalex.*` so reimports never clobber user intent
13
+
14
+ The dict payloads returned by `build_sources_yaml` / `build_models_yaml` are plain,
15
+ serialization-ready dicts — callers choose how to write them (single file, per-file,
16
+ etc.) via `write_dbt_yaml`.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from dataclasses import dataclass, field
22
+ from pathlib import Path
23
+ from typing import Any, Dict, List, Optional, Tuple
24
+
25
+ import yaml
26
+
27
+ from datalex_core.datalex.project import DataLexProject
28
+
29
+
30
+ # ------------------------ build payloads ------------------------
31
+
32
+
33
def build_sources_yaml(project: DataLexProject) -> Dict[str, Dict[str, Any]]:
    """Build one dbt sources document per entry in `project.sources`.

    The result maps a relative path of the form `sources/<name>.yml` to a
    `{"version": 2, "sources": [...]}` payload holding exactly one translated
    source. Keeping one file per source name keeps dbt's `source(name, table)`
    references stable and the files easy to edit by hand.
    """
    return {
        f"sources/{source['name']}.yml": {
            "version": 2,
            "sources": [_source_to_dict(source)],
        }
        for source in project.sources.values()
    }
44
+
45
+
46
def build_models_yaml(project: DataLexProject) -> Dict[str, Dict[str, Any]]:
    """Build the single `models/_schema.yml` payload for `project.models`.

    Every model is translated and collected into one document so dbt can read
    them in a single pass; one file is the dbt community default and avoids
    discovery surprises. An empty dict is returned when the project has no
    models, so no file is produced in that case.
    """
    rendered = list(map(_model_to_dict, project.models.values()))
    return {"models/_schema.yml": {"version": 2, "models": rendered}} if rendered else {}
57
+
58
+
59
+ # ------------------------ writing ------------------------
60
+
61
+
62
@dataclass
class EmitReport:
    """Tally of what one emission run wrote."""

    # Paths of every file written, in write order.
    files: List[str] = field(default_factory=list)
    # Number of source files written.
    sources: int = 0
    # Number of model files written.
    models: int = 0

    def summary(self) -> str:
        """Return a multi-line, human-readable description of the run."""
        heading = [
            "dbt emission complete:",
            f" source files: {self.sources}",
            f" model files: {self.models}",
        ]
        listing = [f" - {written}" for written in self.files]
        return "\n".join(heading + listing)
75
+
76
+
77
def emit_dbt(
    project: DataLexProject,
    out_dir: str,
    include_sources: bool = True,
    include_models: bool = True,
) -> EmitReport:
    """Write the project's dbt YAML tree under `out_dir` and report what was written.

    `out_dir` (and any needed sub-directories) is created on demand. Source
    files and the models schema file are each written only when the matching
    `include_*` flag is true. The returned report lists every written path
    and counts source and model files separately.
    """
    report = EmitReport()
    root = Path(out_dir)
    root.mkdir(parents=True, exist_ok=True)

    def _materialize(documents: Dict[str, Dict[str, Any]]) -> int:
        # Write each payload to its relative path and record it; returns the file count.
        for relative, payload in documents.items():
            destination = root / relative
            destination.parent.mkdir(parents=True, exist_ok=True)
            _write_yaml(destination, payload)
            report.files.append(str(destination))
        return len(documents)

    if include_sources:
        report.sources += _materialize(build_sources_yaml(project))
    if include_models:
        report.models += _materialize(build_models_yaml(project))
    return report
105
+
106
+
107
def _write_yaml(path: Path, doc: Dict[str, Any]) -> None:
    """Serialize `doc` to `path` as UTF-8 block-style YAML, keeping key order."""
    dump_options = {
        "sort_keys": False,
        "default_flow_style": False,
        "allow_unicode": True,
        "width": 120,
    }
    with path.open("w", encoding="utf-8") as handle:
        yaml.safe_dump(doc, handle, **dump_options)
117
+
118
+
119
+ # ------------------------ translators ------------------------
120
+
121
+
122
def _source_to_dict(src: Dict[str, Any]) -> Dict[str, Any]:
    """Translate one DataLex source document into a dbt `sources:` entry.

    Optional scalar attributes are copied only when set, freshness is
    translated when present, `tables` is always emitted (possibly empty), and
    `meta` is attached only when non-empty.
    """
    rendered: Dict[str, Any] = {"name": src["name"]}
    _copy_if_set(rendered, src, ("description", "database", "schema", "loader", "loaded_at_field"))

    freshness = src.get("freshness")
    if freshness:
        rendered["freshness"] = _translate_freshness(freshness)

    rendered["tables"] = [_source_table_to_dict(table) for table in src.get("tables", []) or []]

    # Tag the entry as a source under meta.datalex for round-tripping.
    merged_meta = _build_meta(src, extra={"kind": "source"})
    if merged_meta:
        rendered["meta"] = merged_meta

    return rendered
138
+
139
+
140
def _source_table_to_dict(tbl: Dict[str, Any]) -> Dict[str, Any]:
    """Translate one source table into its dbt `tables:` entry.

    `columns` and `meta` are emitted only when they end up non-empty.
    """
    rendered: Dict[str, Any] = {"name": tbl["name"]}
    _copy_if_set(rendered, tbl, ("description", "identifier", "loaded_at_field"))

    freshness = tbl.get("freshness")
    if freshness:
        rendered["freshness"] = _translate_freshness(freshness)

    columns = [_source_column_to_dict(column) for column in tbl.get("columns", []) or []]
    if columns:
        rendered["columns"] = columns

    merged_meta = _build_meta(tbl)
    if merged_meta:
        rendered["meta"] = merged_meta
    return rendered
154
+
155
+
156
def _source_column_to_dict(col: Dict[str, Any]) -> Dict[str, Any]:
    """Translate one source column into its dbt `columns:` entry."""
    rendered: Dict[str, Any] = {"name": col["name"]}
    _copy_if_set(rendered, col, ("description",))

    # The DataLex `type` is emitted as dbt's `data_type` so contracts work downstream.
    declared_type = col.get("type")
    if declared_type:
        rendered["data_type"] = declared_type

    declared_tests = col.get("tests")
    if declared_tests:
        rendered["tests"] = list(declared_tests)

    merged_meta = _build_meta(col, extra=_sensitivity_meta(col))
    if merged_meta:
        rendered["meta"] = merged_meta
    return rendered
168
+
169
+
170
def _model_to_dict(m: Dict[str, Any]) -> Dict[str, Any]:
    """Translate one DataLex model document into a dbt `models:` entry.

    Emits, in order and only when non-empty: `description`, `config`,
    `columns`, and `meta`. Model tags are written last, under `config.tags`.
    """
    rendered: Dict[str, Any] = {"name": m["name"]}
    _copy_if_set(rendered, m, ("description",))

    config = _model_config(m)
    if config:
        rendered["config"] = config

    contract_enforced = bool((m.get("contract") or {}).get("enforced"))
    columns = [
        _model_column_to_dict(column, contract_enforced=contract_enforced)
        for column in m.get("columns", []) or []
    ]
    if columns:
        rendered["columns"] = columns

    # Governance keys that travel under meta.datalex; insertion order is part of the output.
    round_trip: Dict[str, Any] = {"kind": "model"}
    if m.get("depends_on"):
        round_trip["depends_on"] = [_ref_to_string(dep) for dep in m["depends_on"]]
    for key in ("derived_sql", "sql_path", "owner", "domain"):
        if m.get(key):
            round_trip[key] = m[key]

    merged_meta = _build_meta(m, extra=round_trip)
    if merged_meta:
        rendered["meta"] = merged_meta

    if m.get("tags"):
        # Creates `config` here when nothing above produced one.
        rendered.setdefault("config", {})["tags"] = list(m["tags"])

    return rendered
207
+
208
+
209
+ def _model_config(m: Dict[str, Any]) -> Dict[str, Any]:
210
+ cfg: Dict[str, Any] = {}
211
+ if m.get("materialization"):
212
+ cfg["materialized"] = m["materialization"]
213
+ if m.get("database"):
214
+ cfg["database"] = m["database"]
215
+ if m.get("schema"):
216
+ cfg["schema"] = m["schema"]
217
+ contract = m.get("contract") or {}
218
+ if contract.get("enforced") is not None:
219
+ cfg["contract"] = {"enforced": bool(contract["enforced"])}
220
+ return cfg
221
+
222
+
223
def _model_column_to_dict(col: Dict[str, Any], contract_enforced: bool) -> Dict[str, Any]:
    """Translate one model column into its dbt `columns:` entry.

    `contract_enforced` tells whether the owning model enforces a contract;
    in that case a column without a declared type still gets a `data_type`.
    """
    rendered: Dict[str, Any] = {"name": col["name"]}
    _copy_if_set(rendered, col, ("description",))

    declared_type = col.get("type")
    if declared_type:
        rendered["data_type"] = declared_type
    elif contract_enforced:
        # dbt parse fails without data_type on contract-enforced models, so
        # leave a visible placeholder in the YAML instead of dropping the key.
        rendered["data_type"] = "UNSPECIFIED"

    constraints = _translate_constraints(col)
    if constraints:
        rendered["constraints"] = constraints

    declared_tests = col.get("tests")
    if declared_tests:
        rendered["tests"] = list(declared_tests)

    merged_meta = _build_meta(col, extra=_sensitivity_meta(col))
    if merged_meta:
        rendered["meta"] = merged_meta
    return rendered
246
+
247
+
248
+ def _translate_constraints(col: Dict[str, Any]) -> List[Dict[str, Any]]:
249
+ """Convert DataLex column constraint rules to dbt constraint entries.
250
+
251
+ Pulls from both shorthand fields (primary_key / unique / nullable) and the
252
+ explicit `constraints:` array. Deduplicates by (type, expression) so the
253
+ same intent declared both ways doesn't produce duplicate entries.
254
+ """
255
+ out: List[Dict[str, Any]] = []
256
+ seen: set = set()
257
+
258
+ def _add(entry: Dict[str, Any]) -> None:
259
+ key = (entry.get("type"), entry.get("expression"))
260
+ if key not in seen:
261
+ seen.add(key)
262
+ out.append(entry)
263
+
264
+ if col.get("primary_key"):
265
+ _add({"type": "primary_key"})
266
+ if col.get("unique"):
267
+ _add({"type": "unique"})
268
+ if col.get("nullable") is False and not col.get("primary_key"):
269
+ _add({"type": "not_null"})
270
+
271
+ ref = col.get("references")
272
+ if ref and ref.get("entity") and ref.get("column"):
273
+ _add({"type": "foreign_key", "expression": f"{ref['entity']}({ref['column']})"})
274
+
275
+ for c in col.get("constraints", []) or []:
276
+ ctype = c.get("type")
277
+ if ctype in ("primary_key", "unique", "not_null"):
278
+ _add({"type": ctype})
279
+ elif ctype == "check" and c.get("expression"):
280
+ _add({"type": "check", "expression": c["expression"]})
281
+ elif ctype == "foreign_key" and c.get("expression"):
282
+ _add({"type": "foreign_key", "expression": c["expression"]})
283
+ return out
284
+
285
+
286
+ def _translate_freshness(f: Dict[str, Any]) -> Dict[str, Any]:
287
+ out: Dict[str, Any] = {}
288
+ for k in ("warn_after", "error_after"):
289
+ if f.get(k):
290
+ out[k] = {"count": f[k]["count"], "period": f[k]["period"]}
291
+ if f.get("filter"):
292
+ out["filter"] = f["filter"]
293
+ return out
294
+
295
+
296
+ def _build_meta(
297
+ obj: Dict[str, Any],
298
+ extra: Optional[Dict[str, Any]] = None,
299
+ ) -> Dict[str, Any]:
300
+ """Merge the object's declared `meta` with governance round-trip keys under
301
+ `meta.datalex.*`. User-declared keys win — we never overwrite."""
302
+ out: Dict[str, Any] = {}
303
+ # start with any user-declared meta
304
+ user_meta = obj.get("meta") or {}
305
+ if isinstance(user_meta, dict):
306
+ out.update({k: v for k, v in user_meta.items() if k != "datalex"})
307
+
308
+ datalex_meta: Dict[str, Any] = {}
309
+ # Preserve existing meta.datalex if present (idempotent re-emits).
310
+ if isinstance(user_meta.get("datalex"), dict):
311
+ datalex_meta.update(user_meta["datalex"])
312
+ if extra:
313
+ for k, v in extra.items():
314
+ datalex_meta.setdefault(k, v)
315
+ if datalex_meta:
316
+ out["datalex"] = datalex_meta
317
+ return out
318
+
319
+
320
+ def _sensitivity_meta(obj: Dict[str, Any]) -> Dict[str, Any]:
321
+ extra: Dict[str, Any] = {}
322
+ if obj.get("sensitivity"):
323
+ extra["sensitivity"] = obj["sensitivity"]
324
+ if obj.get("tags"):
325
+ extra["tags"] = list(obj["tags"])
326
+ if obj.get("terms"):
327
+ extra["terms"] = list(obj["terms"])
328
+ return extra
329
+
330
+
331
+ def _copy_if_set(dst: Dict[str, Any], src: Dict[str, Any], keys: Tuple[str, ...]) -> None:
332
+ for k in keys:
333
+ v = src.get(k)
334
+ if v is not None and v != "":
335
+ dst[k] = v
336
+
337
+
338
+ def _ref_to_string(dep: Dict[str, Any]) -> str:
339
+ if "ref" in dep:
340
+ return f"ref:{dep['ref']}"
341
+ if "source" in dep:
342
+ s = dep["source"]
343
+ return f"source:{s.get('source')}.{s.get('name')}"
344
+ return str(dep)
@@ -0,0 +1,329 @@
1
+ """dbt manifest.json -> DataLex source/model importer with idempotent round-trip.
2
+
3
+ Design:
4
+ * Stable key = `unique_id` from the manifest (e.g. `source.my_project.raw.orders`,
5
+ `model.my_project.stg_orders`). Stored under `meta.datalex.dbt.unique_id`.
6
+ * On re-import, existing DataLex files are *merged*, not overwritten. User-authored
7
+ fields (description, tests, sensitivity, owner, etc.) are preserved; only fields
8
+ the manifest owns (database/schema/columns' data_type) get refreshed.
9
+ * The importer emits ready-to-write dicts; callers choose where to write them
10
+ (typically under `sources/` and `models/`).
11
+
12
+ What we do NOT do here: write files. A thin wrapper does that — `write_import_result`
13
+ in this module — but users can choose to merge into an existing project tree manually
14
+ via their own logic.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ from dataclasses import dataclass, field
21
+ from pathlib import Path
22
+ from typing import Any, Dict, List, Optional
23
+
24
+ import yaml
25
+
26
+
27
+ # ------------------------ public API ------------------------
28
+
29
+
30
@dataclass
class ImportResult:
    """Outcome of a manifest import: ready-to-write docs plus any warnings."""

    # Source docs keyed by their `name`.
    sources: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    # Model docs keyed by their `name`.
    models: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    # Human-readable warning messages.
    warnings: List[str] = field(default_factory=list)
35
+
36
+
37
def import_manifest(
    manifest_path: str,
    existing_project_root: Optional[str] = None,
) -> ImportResult:
    """Parse a dbt manifest.json and return merged DataLex source/model docs.

    Args:
        manifest_path: Path to the manifest JSON file (read as UTF-8).
        existing_project_root: Optional root of an existing DataLex tree.
            When given, documents with a matching `meta.datalex.dbt.unique_id`
            are merged so user-authored fields are preserved.

    Returns:
        An `ImportResult` with source docs and model docs keyed by name.
        Results are keyed by the coerced DataLex name, so two manifest entries
        can land on the same key (the same model name in two packages, or two
        names that coerce to the same value). The later entry still wins, but
        each such replacement is recorded in `ImportResult.warnings` instead
        of passing silently.

    Raises:
        OSError: If the manifest file cannot be opened.
        json.JSONDecodeError: If the manifest is not valid JSON.
    """
    with open(manifest_path, "r", encoding="utf-8") as f:
        manifest = json.load(f)

    existing = _load_existing_by_unique_id(existing_project_root) if existing_project_root else {}

    result = ImportResult()

    # The manifest lists one node per source *table*; group them by source_name
    # so one document is produced per source.
    sources_grouped: Dict[str, List[Dict[str, Any]]] = {}
    for node in (manifest.get("sources") or {}).values():
        source_name = node.get("source_name") or node.get("name")
        sources_grouped.setdefault(source_name, []).append(node)

    for source_name, tables in sources_grouped.items():
        doc = _build_source_doc(source_name, tables, existing)
        if doc["name"] in result.sources:
            result.warnings.append(
                f"source {source_name!r} maps to name {doc['name']!r}, which was already "
                "produced by another source; the earlier document was replaced"
            )
        result.sources[doc["name"]] = doc

    for node in (manifest.get("nodes") or {}).values():
        if node.get("resource_type") != "model":
            continue
        doc = _build_model_doc(node, existing)
        if doc["name"] in result.models:
            result.warnings.append(
                f"model {node.get('unique_id')!r} maps to name {doc['name']!r}, which was "
                "already produced by another model; the earlier document was replaced"
            )
        result.models[doc["name"]] = doc

    return result
73
+
74
+
75
def write_import_result(result: ImportResult, out_root: str) -> List[str]:
    """Persist an ImportResult into a DataLex-style tree under out_root.

    Layout:
        <out_root>/sources/<name>.yaml
        <out_root>/models/dbt/<name>.yaml

    Returns the written paths as strings, sources first, then models.
    """
    root = Path(out_root)

    def _targets():
        # Lazily pair each document with its destination so writes happen in order.
        for source_doc in result.sources.values():
            yield root / "sources" / f"{source_doc['name']}.yaml", source_doc
        for model_doc in result.models.values():
            yield root / "models" / "dbt" / f"{model_doc['name']}.yaml", model_doc

    written: List[str] = []
    for destination, payload in _targets():
        _write_yaml(destination, payload)
        written.append(str(destination))
    return written
96
+
97
+
98
+ # ------------------------ builders ------------------------
99
+
100
+
101
def _build_source_doc(
    source_name: str,
    tables: List[Dict[str, Any]],
    existing: Dict[str, Dict[str, Any]],
) -> Dict[str, Any]:
    """Assemble the DataLex `kind: source` document for one source.

    `tables` holds the manifest nodes grouped under `source_name` (expected to
    be non-empty); `existing` indexes previously written docs by unique_id.
    """
    # Prefer a prior doc matched by any table's unique_id, so cross-table user
    # fields (e.g. a source-level owner) survive. The last matching table wins.
    prior_doc: Optional[Dict[str, Any]] = None
    for table in tables:
        prior_doc = existing.get(table.get("unique_id")) or prior_doc

    doc: Dict[str, Any] = {"kind": "source", "name": _safe_name(source_name)}
    if prior_doc:
        _merge_preserving_user_fields(doc, prior_doc, keys=("description", "owner", "tags", "loader", "loaded_at_field", "freshness"))

    # dbt stores source-level attributes on every table node; read them off the first.
    head = tables[0]
    for attribute in ("database", "schema"):
        if head.get(attribute):
            doc[attribute] = head[attribute]

    doc["tables"] = [_build_source_table_doc(table, prior_doc) for table in tables]

    # Record every table's unique_id so a re-import can find this doc even if one table is renamed.
    known_ids = sorted(table.get("unique_id") for table in tables if table.get("unique_id"))
    datalex_meta = doc.setdefault("meta", {}).setdefault("datalex", {})
    datalex_meta["dbt"] = {"unique_ids": known_ids}
    return doc
140
+
141
+
142
def _build_source_table_doc(
    t: Dict[str, Any],
    existing_source: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
    """Build the DataLex table entry for one manifest source-table node.

    Args:
        t: Manifest node for the table.
        existing_source: Previously written source doc, if any. Its table with
            the same (coerced) name supplies user-authored values.

    Returns:
        The table dict: `name`, then `description`, `identifier`,
        `loaded_at_field`, `freshness`, `columns` and `meta` when set.

    The description follows the same rule as the source, model and column
    builders in this module: a description already present in the existing
    doc is kept, and the manifest's description is used only when there is no
    prior one. This keeps a re-import from overwriting user-authored text.
    """
    name = _safe_name(t.get("name", ""))
    table_doc: Dict[str, Any] = {"name": name}

    # Locate the prior table body, if present.
    prior_table: Dict[str, Any] = {}
    if existing_source:
        for candidate in existing_source.get("tables", []) or []:
            if candidate.get("name") == name:
                prior_table = candidate
                break

    # User-authored description wins; the manifest only fills a gap.
    description = prior_table.get("description") or t.get("description")
    if description:
        table_doc["description"] = description

    # `identifier` is only worth recording when it differs from the name.
    if t.get("identifier") and t["identifier"] != name:
        table_doc["identifier"] = t["identifier"]
    if t.get("loaded_at_field"):
        table_doc["loaded_at_field"] = t["loaded_at_field"]
    if t.get("freshness"):
        table_doc["freshness"] = t["freshness"]

    # Columns may arrive as a name-keyed mapping or as a plain list.
    prior_cols = {c.get("name"): c for c in (prior_table.get("columns") or []) if c.get("name")}
    columns_raw = t.get("columns") or {}
    column_iter = columns_raw.values() if isinstance(columns_raw, dict) else columns_raw
    cols_out: List[Dict[str, Any]] = [
        _build_source_column_doc(c, prior_cols.get(c.get("name"), {})) for c in column_iter
    ]
    if cols_out:
        table_doc["columns"] = cols_out

    # The unique_id is preserved at table level too.
    if t.get("unique_id"):
        table_doc.setdefault("meta", {}).setdefault("datalex", {}).setdefault("dbt", {})[
            "unique_id"
        ] = t["unique_id"]

    return table_doc
184
+
185
+
186
+ def _build_source_column_doc(c: Dict[str, Any], prior: Dict[str, Any]) -> Dict[str, Any]:
187
+ doc: Dict[str, Any] = {"name": c.get("name")}
188
+ # type: manifest owns it; prefer manifest value if present
189
+ if c.get("data_type"):
190
+ doc["type"] = c["data_type"]
191
+ elif prior.get("type"):
192
+ doc["type"] = prior["type"]
193
+
194
+ # user-authored: preserve
195
+ for k in ("description", "sensitivity", "tags"):
196
+ if prior.get(k):
197
+ doc[k] = prior[k]
198
+ # manifest description wins only if user has no override
199
+ if c.get("description") and "description" not in doc:
200
+ doc["description"] = c["description"]
201
+
202
+ return doc
203
+
204
+
205
def _build_model_doc(
    node: Dict[str, Any],
    existing: Dict[str, Dict[str, Any]],
) -> Dict[str, Any]:
    """Build the DataLex `kind: model` document for one manifest model node.

    Args:
        node: Manifest node whose `resource_type` is `model`.
        existing: Previously written docs indexed by dbt unique_id; the entry
            matching this node's `unique_id` supplies user-authored fields.

    Returns:
        The model dict. User-owned fields (description, owner, domain, tags,
        materialization, contract) come from the prior doc when set;
        `database` and `schema` always come from the manifest; the manifest's
        materialization and description only fill gaps.

    Parent model ids are read as `model.<package>.<name>[.v<version>]`, so the
    ref name is the third dot-separated segment. Taking the last segment would
    yield the version suffix (e.g. `v2`) for versioned models.
    """
    name = _safe_name(node.get("name", ""))
    uid = node.get("unique_id")
    prior = existing.get(uid, {}) if uid else {}

    doc: Dict[str, Any] = {
        "kind": "model",
        "name": name,
    }
    # User-owned fields are preserved.
    _merge_preserving_user_fields(
        doc, prior, keys=("description", "owner", "domain", "tags", "materialization", "contract"),
    )

    # Manifest-owned fields.
    config = node.get("config") or {}
    if config.get("materialized") and "materialization" not in doc:
        doc["materialization"] = config["materialized"]
    if node.get("database"):
        doc["database"] = node["database"]
    if node.get("schema"):
        doc["schema"] = node["schema"]
    if node.get("description") and "description" not in doc:
        doc["description"] = node["description"]

    # depends_on comes from the manifest and covers both refs and sources.
    depends: List[Dict[str, Any]] = []
    for parent_uid in (node.get("depends_on", {}) or {}).get("nodes", []) or []:
        parts = parent_uid.split(".")
        if parent_uid.startswith("model."):
            # Third segment is the model name; anything after it is a version suffix.
            model_name = parts[2] if len(parts) >= 3 else parts[-1]
            depends.append({"ref": _safe_name(model_name)})
        elif parent_uid.startswith("source."):
            # source.<package>.<source_name>.<table_name>
            if len(parts) >= 4:
                depends.append({"source": {"source": _safe_name(parts[-2]), "name": _safe_name(parts[-1])}})
    if depends:
        doc["depends_on"] = depends

    # Columns may arrive as a name-keyed mapping or as a plain list.
    prior_cols = {c.get("name"): c for c in (prior.get("columns") or []) if c.get("name")}
    columns_raw = node.get("columns") or {}
    column_iter = columns_raw.values() if isinstance(columns_raw, dict) else columns_raw
    cols_out: List[Dict[str, Any]] = [
        _build_model_column_doc(c, prior_cols.get(c.get("name"), {})) for c in column_iter
    ]
    if cols_out:
        doc["columns"] = cols_out

    # Stable key for the next re-import.
    if uid:
        doc.setdefault("meta", {}).setdefault("datalex", {})["dbt"] = {"unique_id": uid}

    return doc
259
+
260
+
261
+ def _build_model_column_doc(c: Dict[str, Any], prior: Dict[str, Any]) -> Dict[str, Any]:
262
+ doc: Dict[str, Any] = {"name": c.get("name")}
263
+ if c.get("data_type"):
264
+ doc["type"] = c["data_type"]
265
+ elif prior.get("type"):
266
+ doc["type"] = prior["type"]
267
+
268
+ for k in ("description", "sensitivity", "tags", "terms", "tests", "constraints"):
269
+ if prior.get(k):
270
+ doc[k] = prior[k]
271
+ if c.get("description") and "description" not in doc:
272
+ doc["description"] = c["description"]
273
+ return doc
274
+
275
+
276
+ # ------------------------ helpers ------------------------
277
+
278
+
279
+ def _load_existing_by_unique_id(project_root: str) -> Dict[str, Dict[str, Any]]:
280
+ """Walk the project tree and index every doc by its `meta.datalex.dbt.unique_id(s)`."""
281
+ out: Dict[str, Dict[str, Any]] = {}
282
+ root = Path(project_root)
283
+ if not root.exists():
284
+ return out
285
+ for path in root.rglob("*.yaml"):
286
+ try:
287
+ with path.open("r", encoding="utf-8") as f:
288
+ doc = yaml.safe_load(f)
289
+ except Exception:
290
+ continue
291
+ if not isinstance(doc, dict):
292
+ continue
293
+ meta = (doc.get("meta") or {}).get("datalex") or {}
294
+ dbt_meta = meta.get("dbt") or {}
295
+ uid = dbt_meta.get("unique_id")
296
+ uids = dbt_meta.get("unique_ids") or ([uid] if uid else [])
297
+ for u in uids:
298
+ if u:
299
+ out[u] = doc
300
+ return out
301
+
302
+
303
+ def _merge_preserving_user_fields(
304
+ dst: Dict[str, Any],
305
+ src: Dict[str, Any],
306
+ keys: tuple,
307
+ ) -> None:
308
+ for k in keys:
309
+ if src.get(k) not in (None, "", [], {}):
310
+ dst[k] = src[k]
311
+
312
+
313
+ def _safe_name(name: str) -> str:
314
+ """DataLex names must match ^[a-z][a-z0-9_]*$ — coerce dbt names that drift."""
315
+ import re
316
+
317
+ if not name:
318
+ return "unnamed"
319
+ s = name.strip().lower()
320
+ s = re.sub(r"[^a-z0-9_]", "_", s)
321
+ if not re.match(r"^[a-z]", s):
322
+ s = "n_" + s
323
+ return s
324
+
325
+
326
def _write_yaml(path: Path, doc: Dict[str, Any]) -> None:
    """Write `doc` to `path` as UTF-8 block-style YAML, creating parent directories first."""
    path.parent.mkdir(parents=True, exist_ok=True)
    dump_options = {
        "sort_keys": False,
        "default_flow_style": False,
        "allow_unicode": True,
    }
    with path.open("w", encoding="utf-8") as stream:
        yaml.safe_dump(doc, stream, **dump_options)