datalex-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datalex_cli/__init__.py +1 -0
- datalex_cli/datalex_cli.py +658 -0
- datalex_cli/main.py +2925 -0
- datalex_cli-0.1.1.dist-info/METADATA +228 -0
- datalex_cli-0.1.1.dist-info/RECORD +64 -0
- datalex_cli-0.1.1.dist-info/WHEEL +5 -0
- datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
- datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
- datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
- datalex_core/__init__.py +94 -0
- datalex_core/_schemas/datalex/common.schema.json +127 -0
- datalex_core/_schemas/datalex/domain.schema.json +24 -0
- datalex_core/_schemas/datalex/entity.schema.json +158 -0
- datalex_core/_schemas/datalex/model.schema.json +141 -0
- datalex_core/_schemas/datalex/policy.schema.json +70 -0
- datalex_core/_schemas/datalex/project.schema.json +82 -0
- datalex_core/_schemas/datalex/snippet.schema.json +24 -0
- datalex_core/_schemas/datalex/source.schema.json +104 -0
- datalex_core/_schemas/datalex/term.schema.json +30 -0
- datalex_core/canonical.py +166 -0
- datalex_core/completion.py +204 -0
- datalex_core/connectors/__init__.py +39 -0
- datalex_core/connectors/base.py +417 -0
- datalex_core/connectors/bigquery.py +229 -0
- datalex_core/connectors/databricks.py +262 -0
- datalex_core/connectors/mysql.py +266 -0
- datalex_core/connectors/postgres.py +309 -0
- datalex_core/connectors/redshift.py +298 -0
- datalex_core/connectors/snowflake.py +336 -0
- datalex_core/connectors/sqlserver.py +425 -0
- datalex_core/datalex/__init__.py +26 -0
- datalex_core/datalex/diff.py +188 -0
- datalex_core/datalex/errors.py +85 -0
- datalex_core/datalex/loader.py +512 -0
- datalex_core/datalex/migrate_layout.py +382 -0
- datalex_core/datalex/parse_cache.py +102 -0
- datalex_core/datalex/project.py +214 -0
- datalex_core/datalex/types.py +224 -0
- datalex_core/dbt/__init__.py +18 -0
- datalex_core/dbt/emit.py +344 -0
- datalex_core/dbt/manifest.py +329 -0
- datalex_core/dbt/profiles.py +185 -0
- datalex_core/dbt/sync.py +279 -0
- datalex_core/dbt/warehouse.py +215 -0
- datalex_core/dialects/__init__.py +15 -0
- datalex_core/dialects/_common.py +48 -0
- datalex_core/dialects/base.py +47 -0
- datalex_core/dialects/postgres.py +164 -0
- datalex_core/dialects/registry.py +36 -0
- datalex_core/dialects/snowflake.py +129 -0
- datalex_core/diffing.py +358 -0
- datalex_core/docs_generator.py +797 -0
- datalex_core/doctor.py +181 -0
- datalex_core/generators.py +478 -0
- datalex_core/importers.py +1176 -0
- datalex_core/issues.py +23 -0
- datalex_core/loader.py +21 -0
- datalex_core/migrate.py +316 -0
- datalex_core/modeling.py +679 -0
- datalex_core/packages.py +430 -0
- datalex_core/policy.py +1037 -0
- datalex_core/resolver.py +456 -0
- datalex_core/schema.py +54 -0
- datalex_core/semantic.py +1561 -0
datalex_core/doctor.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""Project health diagnostics for ``datalex doctor``.
|
|
2
|
+
|
|
3
|
+
Checks:
|
|
4
|
+
- Schema files exist and are valid JSON
|
|
5
|
+
- Policy schema exists and is valid JSON
|
|
6
|
+
- Model files are discoverable and parse as YAML
|
|
7
|
+
- Policy packs are discoverable and parse as YAML
|
|
8
|
+
- Python dependencies are importable
|
|
9
|
+
- CLI entry point is executable
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import importlib
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
import subprocess
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any, Dict, List, Tuple
|
|
19
|
+
|
|
20
|
+
import yaml
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DiagnosticResult:
    """Outcome of one diagnostic check.

    Instances carry exactly three attributes (enforced through
    ``__slots__``): the check ``name``, its ``status`` and an optional
    free-text ``message``.
    """

    __slots__ = ("name", "status", "message")

    def __init__(self, name: str, status: str, message: str = "") -> None:
        # status is one of "ok", "warn", "error"
        self.name, self.status, self.message = name, status, message

    def to_dict(self) -> Dict[str, str]:
        """Return the three attributes as a plain dictionary."""
        return {slot: getattr(self, slot) for slot in DiagnosticResult.__slots__}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _check_file_exists(path: Path, label: str) -> DiagnosticResult:
    """Report whether *path* exists.

    Returns an ``ok`` result carrying the path when it exists, otherwise an
    ``error`` result whose message names the missing path.
    """
    found = path.exists()
    status = "ok" if found else "error"
    detail = str(path) if found else f"Not found: {path}"
    return DiagnosticResult(label, status, detail)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _check_json_file(path: Path, label: str) -> DiagnosticResult:
    """Check that *path* exists and parses as UTF-8 encoded JSON.

    Args:
        path: File to inspect.
        label: Name used for the returned check result.

    Returns:
        An ``ok`` result carrying the path on success; an ``error`` result
        when the file is missing, unreadable, not valid UTF-8 or not valid
        JSON.
    """
    if not path.exists():
        return DiagnosticResult(label, "error", f"Not found: {path}")
    try:
        with path.open("r", encoding="utf-8") as f:
            json.load(f)
    # UnicodeDecodeError is raised while reading a non-UTF-8 file; it is a
    # ValueError that is neither JSONDecodeError nor OSError, so it must be
    # listed explicitly or it would abort the whole diagnostic run.
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
        return DiagnosticResult(label, "error", f"Invalid JSON: {exc}")
    return DiagnosticResult(label, "ok", str(path))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _check_yaml_file(path: Path, label: str) -> DiagnosticResult:
    """Check that *path* exists and parses as UTF-8 encoded YAML.

    Args:
        path: File to inspect.
        label: Name used for the returned check result.

    Returns:
        An ``ok`` result carrying the path on success; an ``error`` result
        when the file is missing, unreadable, not valid UTF-8 or not valid
        YAML.
    """
    if not path.exists():
        return DiagnosticResult(label, "error", f"Not found: {path}")
    try:
        with path.open("r", encoding="utf-8") as f:
            # safe_load only builds plain Python objects; the parsed value is
            # discarded because only parse success matters here.
            yaml.safe_load(f)
    # UnicodeDecodeError comes from the text-mode file read, not from the YAML
    # parser, so it is not covered by yaml.YAMLError or OSError.
    except (yaml.YAMLError, UnicodeDecodeError, OSError) as exc:
        return DiagnosticResult(label, "error", f"Invalid YAML: {exc}")
    return DiagnosticResult(label, "ok", str(path))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _check_importable(module_name: str) -> DiagnosticResult:
    """Check that *module_name* can be imported in the current interpreter.

    Args:
        module_name: Dotted module name passed to ``importlib.import_module``.

    Returns:
        An ``ok`` result named ``import <module_name>`` on success, or an
        ``error`` result whose message describes the failure.
    """
    label = f"import {module_name}"
    try:
        importlib.import_module(module_name)
    except ImportError as exc:
        return DiagnosticResult(label, "error", str(exc))
    except Exception as exc:
        # A module can be installed yet raise something other than ImportError
        # while executing its top level. A diagnostic should report that as a
        # failed check instead of crashing, so the broad catch is deliberate
        # at this boundary.
        return DiagnosticResult(label, "error", f"{type(exc).__name__}: {exc}")
    return DiagnosticResult(label, "ok")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _find_files(root: Path, pattern: str) -> List[Path]:
|
|
74
|
+
return sorted(root.glob(pattern))
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def run_diagnostics(project_dir: str) -> List[DiagnosticResult]:
    """Run all project diagnostics and return results.

    Checks, in order: the project directory itself, the two JSON schema
    files under ``schemas/``, ``*.model.yaml`` files, ``*.policy.yaml``
    packs, importability of ``yaml``, ``jsonschema`` and ``datalex_core``,
    the ``datalex`` script at the project root, and ``requirements.txt``.

    Args:
        project_dir: Path to the project root; it is resolved to an
            absolute path before any check runs.

    Returns:
        One ``DiagnosticResult`` per check, in execution order. When
        *project_dir* is not a directory the list holds only that error.
    """
    root = Path(project_dir).resolve()
    results: List[DiagnosticResult] = []

    # 1. Project directory
    if not root.is_dir():
        results.append(DiagnosticResult("project_directory", "error", f"Not a directory: {root}"))
        return results
    results.append(DiagnosticResult("project_directory", "ok", str(root)))

    skipped_dirs = {".git", "node_modules"}

    def _is_project_file(path: Path) -> bool:
        """Return True unless *path* lies inside a skipped directory of the project."""
        # Compare whole directory components relative to the root. A substring
        # test on the absolute path would also drop ".github/..." files, and
        # would drop everything when the project path itself contains ".git".
        return skipped_dirs.isdisjoint(path.relative_to(root).parts[:-1])

    # 2. Schema files
    model_schema = root / "schemas" / "model.schema.json"
    policy_schema = root / "schemas" / "policy.schema.json"
    results.append(_check_json_file(model_schema, "model_schema"))
    results.append(_check_json_file(policy_schema, "policy_schema"))

    # 3. Model files
    model_files = [f for f in _find_files(root, "**/*.model.yaml") if _is_project_file(f)]
    if model_files:
        results.append(DiagnosticResult("model_files", "ok", f"Found {len(model_files)} model file(s)"))
        for mf in model_files:
            results.append(_check_yaml_file(mf, f"model:{mf.relative_to(root)}"))
    else:
        results.append(DiagnosticResult("model_files", "warn", "No *.model.yaml files found"))

    # 4. Policy packs: prefer the top level of policies/, and fall back to a
    # recursive search of the whole project only when that finds nothing.
    policy_files = _find_files(root / "policies", "*.policy.yaml")
    if not policy_files:
        policy_files = [f for f in _find_files(root, "**/*.policy.yaml") if _is_project_file(f)]
    if policy_files:
        results.append(DiagnosticResult("policy_packs", "ok", f"Found {len(policy_files)} policy pack(s)"))
        for pf in policy_files:
            results.append(_check_yaml_file(pf, f"policy:{pf.relative_to(root)}"))
    else:
        results.append(DiagnosticResult("policy_packs", "warn", "No *.policy.yaml files found"))

    # 5. Python dependencies
    for mod in ["yaml", "jsonschema"]:
        results.append(_check_importable(mod))

    # 6. datalex_core importable
    results.append(_check_importable("datalex_core"))

    # 7. CLI entry point
    cli_path = root / "datalex"
    if cli_path.exists():
        results.append(DiagnosticResult("cli_entrypoint", "ok", str(cli_path)))
        if os.access(str(cli_path), os.X_OK):
            results.append(DiagnosticResult("cli_executable", "ok"))
        else:
            results.append(DiagnosticResult("cli_executable", "warn", "datalex is not executable (chmod +x datalex)"))
    else:
        results.append(DiagnosticResult("cli_entrypoint", "warn", "datalex script not found at project root"))

    # 8. requirements.txt
    req_path = root / "requirements.txt"
    results.append(_check_file_exists(req_path, "requirements_txt"))

    return results
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def format_diagnostics(results: List[DiagnosticResult]) -> str:
    """Render *results* as a plain-text report.

    The report has a title and ruler, one line per check (icon, name and,
    when present, the message), a blank line, a summary of counts per
    status, and an overall status line.
    """
    icons = {"ok": "\u2713", "warn": "!", "error": "\u2717"}
    tally = {"ok": 0, "warn": 0, "error": 0}
    report: List[str] = ["DataLex Doctor", "=" * 40]

    for item in results:
        # Unknown statuses are still listed (with a "?" icon) but not counted.
        if item.status in tally:
            tally[item.status] += 1
        entry = f" [{icons.get(item.status, '?')}] {item.name}"
        report.append(f"{entry} — {item.message}" if item.message else entry)

    report.append("")
    report.append(f"Summary: {tally['ok']} ok, {tally['warn']} warnings, {tally['error']} errors")

    if tally["error"] > 0:
        verdict = "Status: UNHEALTHY"
    elif tally["warn"] > 0:
        verdict = "Status: OK (with warnings)"
    else:
        verdict = "Status: HEALTHY"
    report.append(verdict)

    return "\n".join(report)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def diagnostics_as_json(results: List[DiagnosticResult]) -> Dict[str, Any]:
    """Build a JSON-serializable report from *results*.

    The returned dict holds every check as a dict under ``checks``, the
    per-status counts under ``summary``, and ``healthy``, which is true
    when no check has status ``error``.
    """
    statuses = [item.status for item in results]
    summary = {level: statuses.count(level) for level in ("ok", "warn", "error")}
    checks = [item.to_dict() for item in results]
    return {"checks": checks, "summary": summary, "healthy": summary["error"] == 0}
|
|
@@ -0,0 +1,478 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
from datalex_core.modeling import normalize_model
|
|
6
|
+
|
|
7
|
+
# SQL dialects accepted by generate_sql_ddl; any other value raises ValueError there.
SUPPORTED_DIALECTS = {"postgres", "snowflake", "bigquery", "databricks"}
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _to_snake(name: str) -> str:
|
|
11
|
+
out: List[str] = []
|
|
12
|
+
for idx, char in enumerate(name):
|
|
13
|
+
if char.isupper() and idx > 0 and (not name[idx - 1].isupper()):
|
|
14
|
+
out.append("_")
|
|
15
|
+
out.append(char.lower())
|
|
16
|
+
return "".join(out)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _sql_type(field_type: str, dialect: str) -> str:
|
|
20
|
+
value = field_type.strip().lower()
|
|
21
|
+
if value.startswith("decimal"):
|
|
22
|
+
return value.upper()
|
|
23
|
+
|
|
24
|
+
mapping_postgres = {
|
|
25
|
+
"string": "TEXT",
|
|
26
|
+
"integer": "INTEGER",
|
|
27
|
+
"bigint": "BIGINT",
|
|
28
|
+
"boolean": "BOOLEAN",
|
|
29
|
+
"date": "DATE",
|
|
30
|
+
"timestamp": "TIMESTAMP",
|
|
31
|
+
"float": "DOUBLE PRECISION",
|
|
32
|
+
"json": "JSONB",
|
|
33
|
+
"uuid": "UUID",
|
|
34
|
+
"text": "TEXT",
|
|
35
|
+
"binary": "BYTEA",
|
|
36
|
+
}
|
|
37
|
+
mapping_snowflake = {
|
|
38
|
+
"string": "VARCHAR",
|
|
39
|
+
"integer": "NUMBER",
|
|
40
|
+
"bigint": "NUMBER",
|
|
41
|
+
"boolean": "BOOLEAN",
|
|
42
|
+
"date": "DATE",
|
|
43
|
+
"timestamp": "TIMESTAMP_NTZ",
|
|
44
|
+
"float": "FLOAT",
|
|
45
|
+
"json": "VARIANT",
|
|
46
|
+
"uuid": "VARCHAR",
|
|
47
|
+
"text": "VARCHAR",
|
|
48
|
+
"binary": "BINARY",
|
|
49
|
+
}
|
|
50
|
+
mapping_bigquery = {
|
|
51
|
+
"string": "STRING",
|
|
52
|
+
"integer": "INT64",
|
|
53
|
+
"bigint": "INT64",
|
|
54
|
+
"boolean": "BOOL",
|
|
55
|
+
"date": "DATE",
|
|
56
|
+
"timestamp": "TIMESTAMP",
|
|
57
|
+
"float": "FLOAT64",
|
|
58
|
+
"json": "JSON",
|
|
59
|
+
"uuid": "STRING",
|
|
60
|
+
"text": "STRING",
|
|
61
|
+
"binary": "BYTES",
|
|
62
|
+
}
|
|
63
|
+
mapping_databricks = {
|
|
64
|
+
"string": "STRING",
|
|
65
|
+
"integer": "INT",
|
|
66
|
+
"bigint": "BIGINT",
|
|
67
|
+
"boolean": "BOOLEAN",
|
|
68
|
+
"date": "DATE",
|
|
69
|
+
"timestamp": "TIMESTAMP",
|
|
70
|
+
"float": "DOUBLE",
|
|
71
|
+
"json": "STRING",
|
|
72
|
+
"uuid": "STRING",
|
|
73
|
+
"text": "STRING",
|
|
74
|
+
"binary": "BINARY",
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
mappings = {
|
|
78
|
+
"postgres": mapping_postgres,
|
|
79
|
+
"snowflake": mapping_snowflake,
|
|
80
|
+
"bigquery": mapping_bigquery,
|
|
81
|
+
"databricks": mapping_databricks,
|
|
82
|
+
}
|
|
83
|
+
mapping = mappings.get(dialect, mapping_postgres)
|
|
84
|
+
return mapping.get(value, field_type)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _qualified_name(entity: Dict[str, Any], dialect: str) -> str:
|
|
88
|
+
physical_name = entity.get("physical_name") or entity.get("physicalName")
|
|
89
|
+
inferred_physical = None
|
|
90
|
+
if not physical_name:
|
|
91
|
+
# Backward-compatible fallback: older connector pulls didn't store physical_name.
|
|
92
|
+
# Try to recover the warehouse identifier from the standard "Pulled from ..." description.
|
|
93
|
+
desc = str(entity.get("description") or "")
|
|
94
|
+
m = re.search(r"Pulled from Snowflake [^\s.]+\.[^\s.]+\.([^\s]+) on ", desc)
|
|
95
|
+
if m:
|
|
96
|
+
inferred_physical = m.group(1)
|
|
97
|
+
|
|
98
|
+
table_name = (
|
|
99
|
+
str(physical_name or inferred_physical).strip()
|
|
100
|
+
if (physical_name or inferred_physical)
|
|
101
|
+
else _to_snake(str(entity.get("name", "")))
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
schema_name = entity.get("schema")
|
|
105
|
+
database_name = entity.get("database")
|
|
106
|
+
|
|
107
|
+
# Snowflake treats quoted identifiers as case-sensitive; prefer uppercase identifiers by default
|
|
108
|
+
# so generated DDL matches warehouse naming conventions when physical_name isn't provided.
|
|
109
|
+
if dialect == "snowflake" and not (physical_name or inferred_physical):
|
|
110
|
+
table_name = table_name.upper()
|
|
111
|
+
|
|
112
|
+
if dialect == "bigquery":
|
|
113
|
+
parts = [p for p in [database_name, schema_name, table_name] if p]
|
|
114
|
+
return ".".join([f"`{p}`" for p in parts])
|
|
115
|
+
|
|
116
|
+
if database_name and schema_name:
|
|
117
|
+
return f'"{database_name}"."{schema_name}"."{table_name}"'
|
|
118
|
+
if schema_name:
|
|
119
|
+
return f'"{schema_name}"."{table_name}"'
|
|
120
|
+
return f'"{table_name}"'
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _format_default(value: Any, dialect: str) -> Optional[str]:
|
|
124
|
+
if value is None:
|
|
125
|
+
return "NULL"
|
|
126
|
+
if isinstance(value, bool):
|
|
127
|
+
return "TRUE" if value else "FALSE"
|
|
128
|
+
if isinstance(value, (int, float)):
|
|
129
|
+
return str(value)
|
|
130
|
+
return f"'{value}'"
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def generate_sql_ddl(model: Dict[str, Any], dialect: str = "postgres") -> str:
    """Generate SQL DDL text for *model* in the given *dialect*.

    The model is first passed through ``normalize_model``. Output consists of
    CREATE statements for entities, then ALTER TABLE foreign keys derived
    from relationships, then CREATE INDEX statements, separated by blank
    lines and ending with a newline when anything was emitted.

    Args:
        model: Model dictionary; the ``entities``, ``relationships`` and
            ``indexes`` keys are read after normalization.
        dialect: Target dialect, matched case-insensitively against
            ``SUPPORTED_DIALECTS``.

    Returns:
        The DDL script, or an empty string when no statement was produced.

    Raises:
        ValueError: If *dialect* is not a supported dialect.
    """
    model = normalize_model(model)
    dialect = dialect.lower()
    if dialect not in SUPPORTED_DIALECTS:
        raise ValueError(f"Unsupported SQL dialect. Use one of: {', '.join(sorted(SUPPORTED_DIALECTS))}.")

    entities = model.get("entities", [])
    relationships = model.get("relationships", [])
    indexes = model.get("indexes", [])

    create_blocks: List[str] = []
    alter_blocks: List[str] = []
    index_blocks: List[str] = []

    # Lookup by entity name, used to resolve relationship and index targets below.
    entity_map = {str(e.get("name", "")): e for e in entities}

    for entity in entities:
        entity_type = entity.get("type", "table")
        # Conceptual/logical entities are emitted as ordinary tables.
        if entity_type in {"concept", "logical_entity"}:
            entity_type = "table"
        entity_name = str(entity.get("name", ""))
        qualified = _qualified_name(entity, dialect)
        fields = entity.get("fields", [])

        if entity_type in ("view", "materialized_view"):
            # Views are emitted as placeholders that select NULL for every declared column.
            # NOTE(review): an entity with no fields yields "SELECT ;" — confirm whether
            # that can occur after normalization.
            keyword = "MATERIALIZED VIEW" if entity_type == "materialized_view" else "VIEW"
            col_list = ", ".join([f'NULL AS "{f.get("name")}"' for f in fields])
            create_blocks.append(f"CREATE {keyword} {qualified} AS\nSELECT {col_list};")
            continue

        # External tables and snapshots produce no DDL here.
        if entity_type == "external_table":
            continue

        if entity_type == "snapshot":
            continue

        # Build dimensional comment header for fact/dim/bridge tables
        # (and Data Vault hub/link/satellite entities).
        dim_header: Optional[str] = None
        if entity_type == "fact_table":
            grain = entity.get("grain", [])
            grain_str = ", ".join(grain) if grain else "not declared"
            dim_refs = entity.get("dimension_refs", [])
            dims_str = ", ".join(dim_refs) if dim_refs else "none declared"
            dim_header = (
                f"-- Fact table: {entity_name}\n"
                f"-- Grain: {grain_str}\n"
                f"-- Dimension references: {dims_str}"
            )
        elif entity_type == "dimension_table":
            scd_type = entity.get("scd_type")
            natural_key = entity.get("natural_key") or "not declared"
            conformed = entity.get("conformed", False)
            scd_str = f"SCD Type {scd_type}" if scd_type else "SCD Type 1 (default)"
            dim_header = (
                f"-- Dimension table: {entity_name}\n"
                f"-- Natural key: {natural_key}\n"
                f"-- {scd_str}"
                + ("\n-- CONFORMED: shared across multiple fact tables" if conformed else "")
            )
        elif entity_type == "bridge_table":
            dim_header = f"-- Bridge table: {entity_name} (many-to-many resolution)"
        elif entity_type == "hub":
            business_keys = entity.get("business_keys", [])
            # Each key set is joined with "/"; presumably business_keys is a list of
            # lists of field names — a plain string entry would be split per character.
            business_key_str = ", ".join("/".join(keyset) for keyset in business_keys) if business_keys else "not declared"
            dim_header = (
                f"-- Data Vault Hub: {entity_name}\n"
                f"-- Business keys: {business_key_str}\n"
                f"-- Hash key: {entity.get('hash_key') or 'not declared'}"
            )
        elif entity_type == "link":
            link_refs = entity.get("link_refs", [])
            link_ref_str = ", ".join(link_refs) if link_refs else "not declared"
            dim_header = (
                f"-- Data Vault Link: {entity_name}\n"
                f"-- References: {link_ref_str}\n"
                f"-- Hash key: {entity.get('hash_key') or 'not declared'}"
            )
        elif entity_type == "satellite":
            hash_diff = entity.get("hash_diff_fields", [])
            hash_diff_str = ", ".join(hash_diff) if hash_diff else "not declared"
            dim_header = (
                f"-- Data Vault Satellite: {entity_name}\n"
                f"-- Parent: {entity.get('parent_entity') or 'not declared'}\n"
                f"-- Hash diff fields: {hash_diff_str}"
            )

        column_lines: List[str] = []
        pk_fields: List[str] = []
        check_constraints: List[str] = []

        for field in fields:
            # Computed fields are not physical columns and are left out of the table.
            if field.get("computed") is True:
                continue

            field_name = str(field.get("name", ""))
            col_type = _sql_type(str(field.get("type", "string")), dialect)
            nullable = bool(field.get("nullable", True))
            unique = bool(field.get("unique", False))
            primary_key = bool(field.get("primary_key", False))

            parts = [f'"{field_name}"', col_type]

            default_val = field.get("default")
            # Key presence (not truthiness) decides, so an explicit null default emits DEFAULT NULL.
            if "default" in field:
                formatted = _format_default(default_val, dialect)
                if formatted is not None:
                    parts.append(f"DEFAULT {formatted}")

            if not nullable:
                parts.append("NOT NULL")
            if unique:
                parts.append("UNIQUE")
            if primary_key:
                # Primary keys are collected and emitted as one table-level constraint.
                pk_fields.append(field_name)

            column_lines.append(" " + " ".join(parts))

            check_expr = field.get("check")
            if check_expr:
                # The check expression is inserted verbatim into the constraint.
                constraint_name = f"chk_{_to_snake(entity_name)}_{field_name}"
                check_constraints.append(
                    f' CONSTRAINT "{constraint_name}" CHECK ({check_expr})'
                )

        if pk_fields:
            pk_cols = ", ".join([f'"{col}"' for col in pk_fields])
            column_lines.append(f" PRIMARY KEY ({pk_cols})")

        column_lines.extend(check_constraints)

        # NOTE(review): when every field is computed (or there are none) this emits an
        # empty column list — confirm whether that case needs handling.
        create_sql = f"CREATE TABLE {qualified} (\n" + ",\n".join(column_lines) + "\n);"
        if dim_header:
            create_sql = dim_header + "\n" + create_sql
        create_blocks.append(create_sql)

    for rel in relationships:
        from_ref = str(rel.get("from", ""))
        to_ref = str(rel.get("to", ""))
        cardinality = str(rel.get("cardinality", "one_to_many"))
        rel_name = str(rel.get("name", "relationship"))

        # Both endpoints must be "Entity.field" references; anything else is skipped.
        if "." not in from_ref or "." not in to_ref:
            continue

        from_entity, from_field = from_ref.split(".", 1)
        to_entity, to_field = to_ref.split(".", 1)

        # The foreign key is placed on the child side: "to" for one_to_many and
        # one_to_one, "from" for many_to_one. Other cardinalities emit nothing.
        if cardinality == "one_to_many":
            parent_entity, parent_field = from_entity, from_field
            child_entity, child_field = to_entity, to_field
        elif cardinality == "many_to_one":
            parent_entity, parent_field = to_entity, to_field
            child_entity, child_field = from_entity, from_field
        elif cardinality == "one_to_one":
            parent_entity, parent_field = from_entity, from_field
            child_entity, child_field = to_entity, to_field
        else:
            continue

        constraint = f"fk_{_to_snake(rel_name)}"
        # Entities missing from the model fall back to a name-only stub.
        child_qualified = _qualified_name(entity_map.get(child_entity, {"name": child_entity}), dialect)
        parent_qualified = _qualified_name(entity_map.get(parent_entity, {"name": parent_entity}), dialect)

        # No foreign-key statements are emitted for BigQuery.
        if dialect == "bigquery":
            continue

        alter_sql = (
            f"ALTER TABLE {child_qualified} "
            f'ADD CONSTRAINT "{constraint}" FOREIGN KEY ("{child_field}") '
            f'REFERENCES {parent_qualified} ("{parent_field}");'
        )
        alter_blocks.append(alter_sql)

    for idx_def in indexes:
        idx_name = idx_def.get("name", "")
        idx_entity = idx_def.get("entity", "")
        idx_fields = idx_def.get("fields", [])
        idx_unique = idx_def.get("unique", False)

        entity_obj = entity_map.get(idx_entity, {"name": idx_entity})
        qualified = _qualified_name(entity_obj, dialect)
        cols = ", ".join([f'"{f}"' for f in idx_fields])
        unique_kw = "UNIQUE " if idx_unique else ""

        # No index statements are emitted for BigQuery.
        if dialect == "bigquery":
            continue

        index_blocks.append(
            f'CREATE {unique_kw}INDEX "{idx_name}" ON {qualified} ({cols});'
        )

    # Creates first, then foreign keys, then indexes.
    blocks = create_blocks + alter_blocks + index_blocks
    return "\n\n".join(blocks) + ("\n" if blocks else "")
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _dbt_source_table_name(entity_name: str) -> str:
    """Return the dbt source table name for *entity_name*: its snake-cased form."""
    snake_cased = _to_snake(entity_name)
    return snake_cased
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def dbt_scaffold_files(
    model: Dict[str, Any],
    source_name: str = "raw",
    project_name: str = "data_modeling_mvp",
) -> List[Tuple[str, str]]:
    """Build the contents of a minimal dbt project for *model*.

    Nothing is written to disk; the caller receives the file contents.

    Args:
        model: Model dictionary; only its ``entities`` list is read.
        source_name: Name of the dbt source the staging models select from.
        project_name: dbt project name written to ``dbt_project.yml``.

    Returns:
        ``(relative_path, content)`` pairs in this order:
        ``dbt_project.yml``, one ``models/staging/<model>.sql`` per entity,
        ``models/staging/schema.yml`` and ``models/sources.yml``.
    """
    import json  # local import: only needed for YAML-safe string quoting below

    def _quoted(value: Any) -> str:
        """Render *value* as a double-quoted YAML scalar, escaping quotes and backslashes."""
        # A JSON string is also a valid YAML double-quoted scalar. For text
        # without quotes, backslashes or control characters the result is
        # exactly '"' + text + '"'.
        return json.dumps(str(value), ensure_ascii=False)

    # Model-name prefix per entity type; every other type is staged as "stg_".
    prefixes = {
        "fact_table": "fct",
        "dimension_table": "dim",
        "bridge_table": "brd",
        "hub": "hub",
        "link": "lnk",
        "satellite": "sat",
    }

    entities = model.get("entities", [])

    files: List[Tuple[str, str]] = []
    # Nesting matters in YAML: the project key sits under "models:", "staging:"
    # under the project, and the "+materialized" config under "staging:".
    dbt_project = (
        f"name: {project_name}\n"
        "version: 1.0.0\n"
        "config-version: 2\n\n"
        "profile: default\n\n"
        "models:\n"
        f"  {project_name}:\n"
        "    staging:\n"
        "      +materialized: view\n"
    )
    files.append(("dbt_project.yml", dbt_project))

    schema_lines = ["version: 2", "", "models:"]

    for entity in entities:
        entity_name = str(entity.get("name", ""))
        entity_type = str(entity.get("type", "table"))
        table_name = _dbt_source_table_name(entity_name)
        model_name = f"{prefixes.get(entity_type, 'stg')}_{table_name}"
        fields = entity.get("fields", [])

        sql = (
            f"select\n "
            + ",\n ".join([f'"{field.get("name")}"' for field in fields])
            + f"\nfrom {{{{ source('{source_name}', '{table_name}') }}}}\n"
        )
        files.append((f"models/staging/{model_name}.sql", sql))

        schema_lines.append(f"  - name: {model_name}")
        if entity.get("description"):
            schema_lines.append(f"    description: {_quoted(entity.get('description'))}")

        entity_meta: List[str] = []
        if entity.get("tags"):
            entity_meta.append(f"      tags: {entity['tags']}")
        if entity.get("owner"):
            entity_meta.append(f"      owner: {_quoted(entity['owner'])}")
        if entity.get("subject_area"):
            entity_meta.append(f"      subject_area: {_quoted(entity['subject_area'])}")
        # Dimensional / Data Vault metadata goes into the dbt meta block.
        if entity_type in prefixes:
            entity_meta.append(f"      entity_type: {_quoted(entity_type)}")
        if entity.get("scd_type"):
            entity_meta.append(f"      scd_type: {entity['scd_type']}")
        if entity.get("natural_key"):
            entity_meta.append(f"      natural_key: {_quoted(entity['natural_key'])}")
        if entity.get("conformed"):
            entity_meta.append("      conformed: true")
        if entity.get("dimension_refs"):
            entity_meta.append(f"      dimension_refs: {entity['dimension_refs']}")
        if entity.get("business_keys"):
            entity_meta.append(f"      business_keys: {entity['business_keys']}")
        if entity.get("hash_key"):
            entity_meta.append(f"      hash_key: {_quoted(entity['hash_key'])}")
        if entity.get("link_refs"):
            entity_meta.append(f"      link_refs: {entity['link_refs']}")
        if entity.get("parent_entity"):
            entity_meta.append(f"      parent_entity: {_quoted(entity['parent_entity'])}")
        if entity.get("hash_diff_fields"):
            entity_meta.append(f"      hash_diff_fields: {entity['hash_diff_fields']}")
        if entity_meta:
            schema_lines.append("    meta:")
            schema_lines.extend(entity_meta)

        schema_lines.append("    columns:")
        for field in fields:
            field_name = str(field.get("name", ""))
            schema_lines.append(f"      - name: {field_name}")
            # Columns always get a description; a placeholder is used when none is given.
            description = str(field.get("description", "")).strip() or f"Field {field_name}"
            schema_lines.append(f"        description: {_quoted(description)}")

            field_meta: List[str] = []
            if field.get("sensitivity"):
                field_meta.append(f"          sensitivity: {_quoted(field['sensitivity'])}")
            if field.get("tags"):
                field_meta.append(f"          tags: {field['tags']}")
            if field.get("deprecated"):
                field_meta.append("          deprecated: true")
            if field_meta:
                schema_lines.append("        meta:")
                schema_lines.extend(field_meta)

            # Primary keys get both tests; other non-nullable columns get not_null only.
            tests: List[str] = []
            if field.get("primary_key"):
                tests.extend(["not_null", "unique"])
            elif field.get("nullable") is False:
                tests.append("not_null")
            if tests:
                schema_lines.append("        tests:")
                for test_name in tests:
                    schema_lines.append(f"          - {test_name}")

    files.append(("models/staging/schema.yml", "\n".join(schema_lines) + "\n"))

    source_schema = [
        "version: 2",
        "",
        "sources:",
        f"  - name: {source_name}",
        "    schema: public",
        "    tables:",
    ]
    for entity in entities:
        table_name = _dbt_source_table_name(str(entity.get("name", "")))
        source_schema.append(f"      - name: {table_name}")
    files.append(("models/sources.yml", "\n".join(source_schema) + "\n"))

    return files
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def write_dbt_scaffold(
    model: Dict[str, Any],
    out_dir: str,
    source_name: str = "raw",
    project_name: str = "data_modeling_mvp",
) -> List[str]:
    """Write the dbt scaffold for *model* beneath *out_dir*.

    The output directory and any missing parent directories are created.
    Files are written as UTF-8; existing files at the same paths are
    overwritten.

    Returns:
        The written file paths as strings, in the order they were written.
    """
    base = Path(out_dir)
    base.mkdir(parents=True, exist_ok=True)

    scaffold = dbt_scaffold_files(
        model=model, source_name=source_name, project_name=project_name
    )

    written: List[str] = []
    for relative, text in scaffold:
        destination = base / relative
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(text, encoding="utf-8")
        written.append(str(destination))
    return written
|