datalex-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. datalex_cli/__init__.py +1 -0
  2. datalex_cli/datalex_cli.py +658 -0
  3. datalex_cli/main.py +2925 -0
  4. datalex_cli-0.1.1.dist-info/METADATA +228 -0
  5. datalex_cli-0.1.1.dist-info/RECORD +64 -0
  6. datalex_cli-0.1.1.dist-info/WHEEL +5 -0
  7. datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
  8. datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
  9. datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
  10. datalex_core/__init__.py +94 -0
  11. datalex_core/_schemas/datalex/common.schema.json +127 -0
  12. datalex_core/_schemas/datalex/domain.schema.json +24 -0
  13. datalex_core/_schemas/datalex/entity.schema.json +158 -0
  14. datalex_core/_schemas/datalex/model.schema.json +141 -0
  15. datalex_core/_schemas/datalex/policy.schema.json +70 -0
  16. datalex_core/_schemas/datalex/project.schema.json +82 -0
  17. datalex_core/_schemas/datalex/snippet.schema.json +24 -0
  18. datalex_core/_schemas/datalex/source.schema.json +104 -0
  19. datalex_core/_schemas/datalex/term.schema.json +30 -0
  20. datalex_core/canonical.py +166 -0
  21. datalex_core/completion.py +204 -0
  22. datalex_core/connectors/__init__.py +39 -0
  23. datalex_core/connectors/base.py +417 -0
  24. datalex_core/connectors/bigquery.py +229 -0
  25. datalex_core/connectors/databricks.py +262 -0
  26. datalex_core/connectors/mysql.py +266 -0
  27. datalex_core/connectors/postgres.py +309 -0
  28. datalex_core/connectors/redshift.py +298 -0
  29. datalex_core/connectors/snowflake.py +336 -0
  30. datalex_core/connectors/sqlserver.py +425 -0
  31. datalex_core/datalex/__init__.py +26 -0
  32. datalex_core/datalex/diff.py +188 -0
  33. datalex_core/datalex/errors.py +85 -0
  34. datalex_core/datalex/loader.py +512 -0
  35. datalex_core/datalex/migrate_layout.py +382 -0
  36. datalex_core/datalex/parse_cache.py +102 -0
  37. datalex_core/datalex/project.py +214 -0
  38. datalex_core/datalex/types.py +224 -0
  39. datalex_core/dbt/__init__.py +18 -0
  40. datalex_core/dbt/emit.py +344 -0
  41. datalex_core/dbt/manifest.py +329 -0
  42. datalex_core/dbt/profiles.py +185 -0
  43. datalex_core/dbt/sync.py +279 -0
  44. datalex_core/dbt/warehouse.py +215 -0
  45. datalex_core/dialects/__init__.py +15 -0
  46. datalex_core/dialects/_common.py +48 -0
  47. datalex_core/dialects/base.py +47 -0
  48. datalex_core/dialects/postgres.py +164 -0
  49. datalex_core/dialects/registry.py +36 -0
  50. datalex_core/dialects/snowflake.py +129 -0
  51. datalex_core/diffing.py +358 -0
  52. datalex_core/docs_generator.py +797 -0
  53. datalex_core/doctor.py +181 -0
  54. datalex_core/generators.py +478 -0
  55. datalex_core/importers.py +1176 -0
  56. datalex_core/issues.py +23 -0
  57. datalex_core/loader.py +21 -0
  58. datalex_core/migrate.py +316 -0
  59. datalex_core/modeling.py +679 -0
  60. datalex_core/packages.py +430 -0
  61. datalex_core/policy.py +1037 -0
  62. datalex_core/resolver.py +456 -0
  63. datalex_core/schema.py +54 -0
  64. datalex_core/semantic.py +1561 -0
@@ -0,0 +1,382 @@
1
+ """One-shot migrator: v3 single-model YAML → DataLex file-per-entity layout.
2
+
3
+ Translates a DataLex v3 `*.model.yaml` file (the current "one big
4
+ model" shape) into the DataLex spec layout:
5
+
6
+ datalex.yaml # project manifest (created if missing)
7
+ glossary/<term>.yaml # one file per glossary term
8
+ models/physical/<dialect>/ # one file per entity, layered by physical
9
+ <entity_name>.yaml
10
+
11
+ Rules applied during translation:
12
+ * Entity names are lowered to snake_case; the original PascalCase name is
13
+ preserved in `physical_name:` so DDL round-trips exactly.
14
+ * v3 `fields[]` -> DataLex `columns[]`.
15
+ * v3 top-level `relationships[]` are translated into per-column
16
+ `references:` on the child side (DataLex canonical form). The child side
17
+ is inferred from the cardinality arrow.
18
+ * v3 top-level `indexes[]` are attached to their owning entity.
19
+ * v3 `glossary[]` is split into one file per term under `glossary/`.
20
+ * Governance classification (PII/PHI/etc.) is attached as column
21
+ `sensitivity:` where a column name matches a classified field; entity-level
22
+ classifications are preserved under `meta.datalex.classification`.
23
+
24
+ The migrator is non-destructive: it writes the new tree alongside the existing
25
+ v3 files. The user can commit both, verify equivalence, then delete the v3 copy.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import re
31
+ from dataclasses import dataclass, field
32
+ from pathlib import Path
33
+ from typing import Any, Dict, List, Optional, Tuple
34
+
35
+ import yaml
36
+
37
+ from datalex_core.loader import load_yaml_model
38
+
39
+
40
@dataclass
class MigrationReport:
    """Outcome of one migration run: counters, warnings and touched paths."""

    # Root directory the DataLex tree is written under.
    project_root: Path
    # True when the run emitted a new `datalex.yaml` manifest.
    manifest_written: bool
    # Number of entity / glossary-term / domain documents emitted.
    entities_written: int
    terms_written: int
    domains_written: int
    # Human-readable notes collected during the run.
    warnings: List[str] = field(default_factory=list)
    # Every path handed to the writer, in emission order.
    files: List[str] = field(default_factory=list)

    def summary(self) -> str:
        """Return the report as a newline-joined, human-readable block."""
        manifest_state = "created" if self.manifest_written else "unchanged"
        lines = [
            "DataLex migration complete:",
            f" project root: {self.project_root}",
            f" manifest: {manifest_state}",
            f" entity files: {self.entities_written}",
            f" glossary files: {self.terms_written}",
            f" domain files: {self.domains_written}",
        ]
        if self.warnings:
            # One header line with the count, then one bullet per warning.
            lines.append(f" warnings: {len(self.warnings)}")
            lines.extend(f" - {warning}" for warning in self.warnings)
        return "\n".join(lines)
64
+
65
+
66
def migrate_project(
    v3_model_path: str,
    output_root: Optional[str] = None,
    default_dialect: str = "postgres",
    dry_run: bool = False,
) -> MigrationReport:
    """Migrate a single v3 `*.model.yaml` file to a DataLex project tree.

    Args:
        v3_model_path: Path to the v3 model file to translate.
        output_root: Where to write the new tree. Defaults to the directory
            containing `v3_model_path`.
        default_dialect: Dialect the physical layer is assumed to target.
            Recorded on each entity and in `datalex.yaml`.
        dry_run: Compute the migration plan and return file paths without
            writing. Counters in the report then describe the plan.

    Returns:
        A `MigrationReport` with per-kind counters, warnings, and every path
        that was (or, in a dry run, would be) written.

    Raises:
        ValueError: If the loaded document has no top-level `model` or
            `entities` key.
    """
    src = Path(v3_model_path).resolve()
    root = Path(output_root).resolve() if output_root else src.parent

    v3 = load_yaml_model(str(src))
    if "model" not in v3 or "entities" not in v3:
        raise ValueError(
            f"{src} does not look like a v3 model file (missing 'model' or 'entities' top-level key)"
        )

    report = MigrationReport(
        project_root=root,
        manifest_written=False,
        entities_written=0,
        terms_written=0,
        domains_written=0,
    )

    # `or []` / `or {}` also covers keys that are present but explicitly null.
    entities = v3.get("entities", []) or []
    rel_by_child: Dict[Tuple[str, str], Dict[str, Any]] = _index_relationships_by_child(
        v3.get("relationships", []) or [],
        entities,
    )
    governance = (v3.get("governance") or {}).get("classification") or {}
    domains_list = v3.get("domains", []) or []
    terms_list = v3.get("glossary", []) or []
    indexes = v3.get("indexes", []) or []

    # Group top-level indexes by owning entity once, instead of rescanning the
    # whole list for every entity.
    indexes_by_entity: Dict[Any, List[Dict[str, Any]]] = {}
    for idx in indexes:
        indexes_by_entity.setdefault(idx.get("entity"), []).append(idx)

    # Write manifest only if one does not already exist.
    manifest_path = root / "datalex.yaml"
    manifest_doc = {
        "kind": "project",
        "name": v3["model"]["name"],
        "version": str(v3["model"].get("version", "1")),
        "description": v3["model"].get("description", ""),
        "dialects": [default_dialect],
        "default_dialect": default_dialect,
        "glossary": "glossary/**/*.yaml",
        "models": "models/**/*.yaml",
        "snippets": ".datalex/snippets/**/*.yaml",
    }

    if not manifest_path.exists():
        _write_yaml(manifest_path, manifest_doc, dry_run=dry_run, report=report)
        report.manifest_written = True
    else:
        report.warnings.append(f"{manifest_path} exists; left untouched.")

    # Glossary: one file per term; terms without a usable name are skipped.
    for term in terms_list:
        name = _snake(term.get("term") or term.get("name") or "")
        if not name:
            continue
        doc = {
            "kind": "term",
            "name": name,
            "definition": term.get("definition", ""),
        }
        if term.get("owner"):
            doc["steward"] = term["owner"]  # v3 `owner` becomes `steward`
        if term.get("abbreviation"):
            doc["abbreviation"] = term["abbreviation"]
        if term.get("tags"):
            doc["tags"] = [str(t) for t in term["tags"]]
        path = root / "glossary" / f"{name}.yaml"
        _write_yaml(path, doc, dry_run=dry_run, report=report)
        report.terms_written += 1

    # Domains: one file per named domain.
    for dom in domains_list:
        name = _snake(dom.get("name") or "")
        if not name:
            continue
        doc = {
            "kind": "domain",
            "name": name,
            "description": dom.get("description", ""),
        }
        path = root / "models" / "domains" / f"{name}.yaml"
        _write_yaml(path, doc, dry_run=dry_run, report=report)
        report.domains_written += 1

    # Entities
    for ent in entities:
        orig_name = str(ent["name"])
        snake = _snake(orig_name)
        entity_doc: Dict[str, Any] = {
            "kind": "entity",
            "layer": "physical",
            "dialect": default_dialect,
            "name": snake,
        }
        if orig_name != snake:
            # Keep the original spelling so generated DDL can round-trip.
            entity_doc["physical_name"] = orig_name
        # Optional attributes are copied verbatim and only when truthy; the
        # order below is the key order of the emitted YAML.
        for key in ("description", "owner", "schema", "database", "subject_area"):
            if ent.get(key):
                entity_doc[key] = ent[key]
        if ent.get("tags"):
            entity_doc["tags"] = [_kebab(str(t)) for t in ent["tags"]]
        for key in ("partition_by", "cluster_by"):
            if ent.get(key):
                entity_doc[key] = ent[key]

        cls_for_entity = governance.get(orig_name, {}) or {}

        # columns
        cols: List[Dict[str, Any]] = []
        for f in ent.get("fields") or []:
            col: Dict[str, Any] = {"name": f["name"], "type": _translate_type(f.get("type", "string"))}
            if f.get("description"):
                col["description"] = f["description"]
            if f.get("nullable") is not None:
                col["nullable"] = bool(f["nullable"])
            if f.get("primary_key"):
                col["primary_key"] = True
            if f.get("unique"):
                col["unique"] = True
            if f.get("default") is not None:
                col["default"] = f["default"]
            if f.get("sensitivity"):
                col["sensitivity"] = f["sensitivity"]
            if f.get("deprecated"):
                col["deprecated"] = True
            if f.get("examples"):
                col["examples"] = f["examples"]
            if f.get("tags"):
                col["tags"] = [_kebab(str(t)) for t in f["tags"]]

            # Governance classification at the column. An explicit field-level
            # `sensitivity` wins; str() keeps a non-string label from crashing.
            if isinstance(cls_for_entity, dict):
                sens = cls_for_entity.get(f["name"])
                if sens and "sensitivity" not in col:
                    col["sensitivity"] = str(sens).lower()

            # Check constraints become explicit constraint items
            if f.get("check"):
                col.setdefault("constraints", []).append({
                    "type": "check",
                    "expression": f["check"],
                })

            # v3 relationships -> references (keyed by original entity name)
            rel = rel_by_child.get((orig_name, f["name"]))
            if rel:
                col["references"] = rel

            cols.append(col)
        entity_doc["columns"] = cols

        # Indexes owned by this entity
        ent_indexes: List[Dict[str, Any]] = [
            {
                "name": idx["name"],
                # `or []` tolerates an explicit `fields: null`.
                "columns": list(idx.get("fields") or []),
                **({"unique": True} if idx.get("unique") else {}),
                **({"type": idx["type"]} if idx.get("type") else {}),
            }
            for idx in indexes_by_entity.get(orig_name, [])
        ]
        if ent_indexes:
            entity_doc["indexes"] = ent_indexes

        # Preserve anything else we didn't explicitly migrate under
        # meta.datalex.v3. A v3 `physical_name` key is not copied here.
        preserved: Dict[str, Any] = {}
        for key in (
            "grain", "candidate_keys", "business_keys", "hash_key", "sla",
            "scd_type", "natural_key", "surrogate_key", "conformed",
            "subtype_of", "subtypes", "dimension_refs", "link_refs",
            "parent_entity", "hash_diff_fields", "load_timestamp_field",
            "record_source_field", "distribution", "storage", "template", "templates",
        ):
            if key in ent:
                preserved[key] = ent[key]
        if preserved:
            entity_doc.setdefault("meta", {}).setdefault("datalex", {})["v3"] = preserved

        # Directory layout: models/physical/<dialect>/<entity>.yaml. The
        # subject area is recorded inside the document, not in the path.
        out_path = root / "models" / "physical" / default_dialect / f"{snake}.yaml"
        _write_yaml(out_path, entity_doc, dry_run=dry_run, report=report)
        report.entities_written += 1

    return report
274
+
275
+
276
+ def _index_relationships_by_child(
277
+ relationships: List[Dict[str, Any]],
278
+ entities: List[Dict[str, Any]],
279
+ ) -> Dict[Tuple[str, str], Dict[str, Any]]:
280
+ """Return { (child_entity_pascal, child_field_snake): references_dict }.
281
+
282
+ v3 encodes relationships as top-level objects with from/to = "Entity.field".
283
+ The child side (the one with the FK column) depends on cardinality:
284
+ one_to_many => 'to' is the many side, which is the child
285
+ many_to_one => 'from' is the many side, which is the child
286
+ one_to_one => prefer the non-PK side; fall back to 'from'
287
+ many_to_many => we cannot express in a single column; skip with a warning
288
+ (the join entity typically already has both FKs declared
289
+ at the column level in the migrator output anyway)
290
+ """
291
+ by_child: Dict[Tuple[str, str], Dict[str, Any]] = {}
292
+ for rel in relationships:
293
+ card = rel.get("cardinality")
294
+ frm = rel.get("from", "")
295
+ to = rel.get("to", "")
296
+ if "." not in frm or "." not in to:
297
+ continue
298
+ from_entity, from_field = frm.split(".", 1)
299
+ to_entity, to_field = to.split(".", 1)
300
+
301
+ if card == "many_to_one":
302
+ child = (from_entity, from_field)
303
+ parent = (to_entity, to_field)
304
+ elif card == "one_to_many":
305
+ child = (to_entity, to_field)
306
+ parent = (from_entity, from_field)
307
+ elif card == "one_to_one":
308
+ child = (from_entity, from_field)
309
+ parent = (to_entity, to_field)
310
+ else: # many_to_many — not representable as a single FK
311
+ continue
312
+
313
+ ref = {
314
+ "entity": _snake(parent[0]),
315
+ "column": parent[1],
316
+ }
317
+ if rel.get("on_delete"):
318
+ ref["on_delete"] = rel["on_delete"]
319
+ if rel.get("on_update"):
320
+ ref["on_update"] = rel["on_update"]
321
+ ref["relationship"] = _rel_from_cardinality(card)
322
+ by_child[child] = ref
323
+
324
+ return by_child
325
+
326
+
327
+ def _rel_from_cardinality(card: Optional[str]) -> str:
328
+ return {
329
+ "many_to_one": "many_to_one",
330
+ "one_to_many": "many_to_one",
331
+ "one_to_one": "one_to_one",
332
+ "many_to_many": "many_to_many",
333
+ }.get(card or "", "many_to_one")
334
+
335
+
336
+ _V3_TYPE_MAP = {
337
+ "string": "string",
338
+ "text": "text",
339
+ "integer": "integer",
340
+ "int": "integer",
341
+ "bigint": "bigint",
342
+ "float": "float",
343
+ "double": "float",
344
+ "boolean": "boolean",
345
+ "bool": "boolean",
346
+ "date": "date",
347
+ "timestamp": "timestamp",
348
+ "datetime": "timestamp",
349
+ "timestamp_tz": "timestamp_tz",
350
+ "timestamptz": "timestamp_tz",
351
+ "uuid": "uuid",
352
+ "json": "json",
353
+ "jsonb": "json",
354
+ "binary": "binary",
355
+ }
356
+
357
+
358
+ def _translate_type(t: str) -> str:
359
+ raw = (t or "").strip()
360
+ lower = raw.lower()
361
+ if lower.startswith("decimal"):
362
+ return lower
363
+ return _V3_TYPE_MAP.get(lower, raw)
364
+
365
+
366
+ def _snake(name: str) -> str:
367
+ s1 = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
368
+ s2 = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
369
+ return re.sub(r"[^a-z0-9_]", "_", s2).strip("_") or name.lower()
370
+
371
+
372
def _kebab(name: str) -> str:
    """Return `name` in kebab-case: its `_snake` form with `_` swapped for `-`."""
    snake_form = _snake(name)
    return snake_form.replace("_", "-")
374
+
375
+
376
+ def _write_yaml(path: Path, doc: Dict[str, Any], dry_run: bool, report: MigrationReport) -> None:
377
+ report.files.append(str(path))
378
+ if dry_run:
379
+ return
380
+ path.parent.mkdir(parents=True, exist_ok=True)
381
+ with path.open("w", encoding="utf-8") as f:
382
+ yaml.safe_dump(doc, f, sort_keys=False, default_flow_style=False, allow_unicode=True)
@@ -0,0 +1,102 @@
1
+ """Content-hash parse cache for DataLex YAML files.
2
+
3
+ The loader parses YAML, validates against a JSON Schema, and strips source
4
+ marks — work that is deterministic given (file bytes, schema bytes). For a
5
+ 10K-entity project, reparsing every file on every validate / diff / emit is
6
+ the dominant cost. This cache eliminates it.
7
+
8
+ Cache layout:
9
+ <cache_root>/<content_sha>__<schema_sha>.json
10
+
11
+ where:
12
+ content_sha is sha256(file bytes)
13
+ schema_sha is sha256(schema bytes for the file's declared kind)
14
+
15
+ The cached payload is a JSON dump of the already-validated, mark-stripped
16
+ document. We store JSON (not pickle) so the cache survives across Python
17
+ versions and is inspectable by humans.
18
+
19
+ Opt-in: set `DATALEX_CACHE=1` in the environment, or pass
20
+ `cache_dir=<path>` to `load_project`. Cache is keyed purely by content and
21
+ schema hashes, so stale entries are never served — if the file (or its schema)
22
+ changes by a single byte, the cache key changes too.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import hashlib
28
+ import json
29
+ import os
30
+ from pathlib import Path
31
+ from typing import Any, Dict, Optional
32
+
33
+
34
class ParseCache:
    """Disk-backed, content-addressed parse cache.

    Entries live at `<cache_dir>/<content_sha>__<schema_sha>.json`.

    Safe to use from multiple processes: each writer dumps to its own
    process-specific temp file and publishes it with an atomic rename.
    Threads inside one process share a temp name for the same key.
    Schema hash is lazily computed once per kind for this instance.
    """

    def __init__(self, cache_dir: Path, schemas_root: Path) -> None:
        """Remember the two roots and create `cache_dir` if it is missing."""
        self.cache_dir = cache_dir
        self.schemas_root = schemas_root
        self._schema_hashes: Dict[str, str] = {}  # kind -> sha256 hex or "no-schema"
        self.hits = 0
        self.misses = 0
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _schema_hash(self, kind: str) -> str:
        """Return the memoised sha256 of `<kind>.schema.json`, or "no-schema"."""
        if kind in self._schema_hashes:
            return self._schema_hashes[kind]
        path = self.schemas_root / f"{kind}.schema.json"
        if not path.exists():
            self._schema_hashes[kind] = "no-schema"
            return "no-schema"
        h = hashlib.sha256(path.read_bytes()).hexdigest()
        self._schema_hashes[kind] = h
        return h

    def _key(self, content_sha: str, kind: str) -> Path:
        """Return the cache file path for this content hash and kind."""
        schema_sha = self._schema_hash(kind)
        return self.cache_dir / f"{content_sha}__{schema_sha}.json"

    def get(self, path: Path, kind: str) -> Optional[Dict[str, Any]]:
        """Return the cached document for `path`, or None on a miss.

        A missing, unreadable or undecodable entry counts as exactly one miss.
        `hits` is only incremented after the payload has been parsed.
        """
        content_sha = hashlib.sha256(path.read_bytes()).hexdigest()
        key = self._key(content_sha, kind)
        try:
            # Open directly instead of checking exists() first, so an entry
            # removed in between is simply a miss.
            with key.open("r", encoding="utf-8") as f:
                doc = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError:
            # a corrupt entry is treated as a miss.
            self.misses += 1
            return None
        self.hits += 1
        return doc

    def put(self, path: Path, kind: str, doc: Dict[str, Any]) -> None:
        """Store `doc` as the cached parse of `path`, best-effort.

        I/O failures are swallowed because the cache is an optimisation. A
        document that JSON cannot serialise still raises, but the partial temp
        file is removed first.
        """
        content_sha = hashlib.sha256(path.read_bytes()).hexdigest()
        key = self._key(content_sha, kind)
        # Per-process temp name so concurrent writers never share a file.
        tmp = key.with_name(f"{key.name}.{os.getpid()}.tmp")
        try:
            with tmp.open("w", encoding="utf-8") as f:
                json.dump(doc, f, sort_keys=True)
            os.replace(tmp, key)
        except OSError:
            pass  # could not write or publish; the entry is just not cached
        finally:
            # After a successful rename the temp path is gone; otherwise drop
            # whatever partial file was left behind.
            try:
                tmp.unlink()
            except OSError:
                pass

    def summary(self) -> Dict[str, int]:
        """Return the hit/miss counters accumulated by `get`."""
        return {"hits": self.hits, "misses": self.misses}
93
+
94
+
95
def cache_enabled_from_env() -> bool:
    """Return True when `DATALEX_CACHE` is set to 1, true or yes (any case)."""
    flag = os.environ.get("DATALEX_CACHE", "").lower()
    return flag in {"1", "true", "yes"}
97
+
98
+
99
def default_cache_dir(project_root: Path) -> Path:
    """Return the per-project cache directory, `<project_root>/build/.cache/datalex-parse`.

    Kept under the project, not $HOME, so it's scoped to the checkout and
    easy to wipe (`rm -rf build/`).
    """
    return project_root.joinpath("build", ".cache", "datalex-parse")