datalex-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. datalex_cli/__init__.py +1 -0
  2. datalex_cli/datalex_cli.py +658 -0
  3. datalex_cli/main.py +2925 -0
  4. datalex_cli-0.1.1.dist-info/METADATA +228 -0
  5. datalex_cli-0.1.1.dist-info/RECORD +64 -0
  6. datalex_cli-0.1.1.dist-info/WHEEL +5 -0
  7. datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
  8. datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
  9. datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
  10. datalex_core/__init__.py +94 -0
  11. datalex_core/_schemas/datalex/common.schema.json +127 -0
  12. datalex_core/_schemas/datalex/domain.schema.json +24 -0
  13. datalex_core/_schemas/datalex/entity.schema.json +158 -0
  14. datalex_core/_schemas/datalex/model.schema.json +141 -0
  15. datalex_core/_schemas/datalex/policy.schema.json +70 -0
  16. datalex_core/_schemas/datalex/project.schema.json +82 -0
  17. datalex_core/_schemas/datalex/snippet.schema.json +24 -0
  18. datalex_core/_schemas/datalex/source.schema.json +104 -0
  19. datalex_core/_schemas/datalex/term.schema.json +30 -0
  20. datalex_core/canonical.py +166 -0
  21. datalex_core/completion.py +204 -0
  22. datalex_core/connectors/__init__.py +39 -0
  23. datalex_core/connectors/base.py +417 -0
  24. datalex_core/connectors/bigquery.py +229 -0
  25. datalex_core/connectors/databricks.py +262 -0
  26. datalex_core/connectors/mysql.py +266 -0
  27. datalex_core/connectors/postgres.py +309 -0
  28. datalex_core/connectors/redshift.py +298 -0
  29. datalex_core/connectors/snowflake.py +336 -0
  30. datalex_core/connectors/sqlserver.py +425 -0
  31. datalex_core/datalex/__init__.py +26 -0
  32. datalex_core/datalex/diff.py +188 -0
  33. datalex_core/datalex/errors.py +85 -0
  34. datalex_core/datalex/loader.py +512 -0
  35. datalex_core/datalex/migrate_layout.py +382 -0
  36. datalex_core/datalex/parse_cache.py +102 -0
  37. datalex_core/datalex/project.py +214 -0
  38. datalex_core/datalex/types.py +224 -0
  39. datalex_core/dbt/__init__.py +18 -0
  40. datalex_core/dbt/emit.py +344 -0
  41. datalex_core/dbt/manifest.py +329 -0
  42. datalex_core/dbt/profiles.py +185 -0
  43. datalex_core/dbt/sync.py +279 -0
  44. datalex_core/dbt/warehouse.py +215 -0
  45. datalex_core/dialects/__init__.py +15 -0
  46. datalex_core/dialects/_common.py +48 -0
  47. datalex_core/dialects/base.py +47 -0
  48. datalex_core/dialects/postgres.py +164 -0
  49. datalex_core/dialects/registry.py +36 -0
  50. datalex_core/dialects/snowflake.py +129 -0
  51. datalex_core/diffing.py +358 -0
  52. datalex_core/docs_generator.py +797 -0
  53. datalex_core/doctor.py +181 -0
  54. datalex_core/generators.py +478 -0
  55. datalex_core/importers.py +1176 -0
  56. datalex_core/issues.py +23 -0
  57. datalex_core/loader.py +21 -0
  58. datalex_core/migrate.py +316 -0
  59. datalex_core/modeling.py +679 -0
  60. datalex_core/packages.py +430 -0
  61. datalex_core/policy.py +1037 -0
  62. datalex_core/resolver.py +456 -0
  63. datalex_core/schema.py +54 -0
  64. datalex_core/semantic.py +1561 -0
datalex_core/doctor.py ADDED
@@ -0,0 +1,181 @@
1
+ """Project health diagnostics for ``datalex doctor``.
2
+
3
+ Checks:
4
+ - Schema files exist and are valid JSON
5
+ - Policy schema exists and is valid JSON
6
+ - Model files are discoverable and parse as YAML
7
+ - Policy packs are discoverable and parse as YAML
8
+ - Python dependencies are importable
9
+ - CLI entry point is executable
10
+ """
11
+
12
+ import importlib
13
+ import json
14
+ import os
15
+ import subprocess
16
+ import sys
17
+ from pathlib import Path
18
+ from typing import Any, Dict, List, Tuple
19
+
20
+ import yaml
21
+
22
+
23
class DiagnosticResult:
    """Outcome of one diagnostic check.

    Holds the check's name, its status string and an optional
    human-readable message.
    """

    __slots__ = ("name", "status", "message")

    def __init__(self, name: str, status: str, message: str = "") -> None:
        # status is one of "ok", "warn" or "error".
        self.name, self.status, self.message = name, status, message

    def to_dict(self) -> Dict[str, str]:
        """Return the result as a plain ``name``/``status``/``message`` dict."""
        return dict(name=self.name, status=self.status, message=self.message)
35
+
36
+
37
def _check_file_exists(path: Path, label: str) -> DiagnosticResult:
    """Report whether *path* exists.

    Returns an "ok" result carrying the path, or an "error" result with a
    "Not found" message.
    """
    if not path.exists():
        return DiagnosticResult(label, "error", f"Not found: {path}")
    return DiagnosticResult(label, "ok", str(path))
41
+
42
+
43
def _check_json_file(path: Path, label: str) -> DiagnosticResult:
    """Check that *path* exists and parses as UTF-8 encoded JSON.

    Args:
        path: File to validate.
        label: Name to give the resulting check.

    Returns:
        An "ok" result carrying the path, or an "error" result describing
        why the file is missing, unreadable or not valid JSON.
    """
    if not path.exists():
        return DiagnosticResult(label, "error", f"Not found: {path}")
    try:
        with path.open("r", encoding="utf-8") as f:
            json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
        # UnicodeDecodeError is raised while reading a file that is not
        # valid UTF-8; it is not a JSONDecodeError, so it must be listed
        # explicitly or the whole diagnostic run would abort.
        return DiagnosticResult(label, "error", f"Invalid JSON: {exc}")
    return DiagnosticResult(label, "ok", str(path))
52
+
53
+
54
def _check_yaml_file(path: Path, label: str) -> DiagnosticResult:
    """Check that *path* exists and parses as UTF-8 encoded YAML.

    Args:
        path: File to validate.
        label: Name to give the resulting check.

    Returns:
        An "ok" result carrying the path, or an "error" result describing
        why the file is missing, unreadable or not valid YAML.
    """
    if not path.exists():
        return DiagnosticResult(label, "error", f"Not found: {path}")
    try:
        with path.open("r", encoding="utf-8") as f:
            yaml.safe_load(f)
    except (yaml.YAMLError, UnicodeDecodeError, OSError) as exc:
        # Decoding errors surface from the text stream as
        # UnicodeDecodeError, which is not a YAMLError; catch it so one
        # badly encoded file cannot abort the whole diagnostic run.
        return DiagnosticResult(label, "error", f"Invalid YAML: {exc}")
    return DiagnosticResult(label, "ok", str(path))
63
+
64
+
65
def _check_importable(module_name: str) -> DiagnosticResult:
    """Check that *module_name* can be imported in this interpreter.

    Args:
        module_name: Dotted module name to import.

    Returns:
        An "ok" result, or an "error" result whose message is the text of
        the exception raised by the import.
    """
    label = f"import {module_name}"
    try:
        importlib.import_module(module_name)
    except Exception as exc:  # noqa: BLE001 - diagnostic boundary
        # A module can fail at import time with something other than
        # ImportError (for example a SyntaxError or an error raised by its
        # top-level code). Report that as a failed check instead of letting
        # it abort the remaining diagnostics.
        return DiagnosticResult(label, "error", str(exc))
    return DiagnosticResult(label, "ok")
71
+
72
+
73
+ def _find_files(root: Path, pattern: str) -> List[Path]:
74
+ return sorted(root.glob(pattern))
75
+
76
+
77
def _in_ignored_dir(path: Path, root: Path) -> bool:
    """Return True when *path* lies inside a ``.git`` or ``node_modules`` directory below *root*."""
    parent_dirs = path.relative_to(root).parts[:-1]
    return ".git" in parent_dirs or "node_modules" in parent_dirs


def run_diagnostics(project_dir: str) -> List[DiagnosticResult]:
    """Run all project diagnostics and return results.

    Checks are appended in a fixed order: project directory, the two JSON
    schema files under ``schemas/``, ``*.model.yaml`` files, policy packs,
    importability of ``yaml``, ``jsonschema`` and ``datalex_core``, the
    ``datalex`` script at the project root, and ``requirements.txt``.

    Args:
        project_dir: Path of the project to inspect; it is resolved to an
            absolute path before use.

    Returns:
        One ``DiagnosticResult`` per check. If ``project_dir`` is not a
        directory, only that single error result is returned.
    """
    root = Path(project_dir).resolve()
    results: List[DiagnosticResult] = []

    # 1. Project directory
    if not root.is_dir():
        results.append(DiagnosticResult("project_directory", "error", f"Not a directory: {root}"))
        return results
    results.append(DiagnosticResult("project_directory", "ok", str(root)))

    # 2. Schema files
    model_schema = root / "schemas" / "model.schema.json"
    policy_schema = root / "schemas" / "policy.schema.json"
    results.append(_check_json_file(model_schema, "model_schema"))
    results.append(_check_json_file(policy_schema, "policy_schema"))

    # 3. Model files
    # Exclusion is decided on directory names relative to the project root.
    # A substring test on the absolute path would also drop files when the
    # project itself lives under a path containing ".git" (for example a
    # checkout named "app.git") or under directories such as ".github".
    model_files = [
        f for f in _find_files(root, "**/*.model.yaml") if not _in_ignored_dir(f, root)
    ]
    if model_files:
        results.append(DiagnosticResult("model_files", "ok", f"Found {len(model_files)} model file(s)"))
        for mf in model_files:
            results.append(_check_yaml_file(mf, f"model:{mf.relative_to(root)}"))
    else:
        results.append(DiagnosticResult("model_files", "warn", "No *.model.yaml files found"))

    # 4. Policy packs: prefer the conventional policies/ directory and fall
    # back to a recursive search of the whole project.
    policy_files = _find_files(root / "policies", "*.policy.yaml")
    if not policy_files:
        policy_files = [
            f for f in _find_files(root, "**/*.policy.yaml") if not _in_ignored_dir(f, root)
        ]
    if policy_files:
        results.append(DiagnosticResult("policy_packs", "ok", f"Found {len(policy_files)} policy pack(s)"))
        for pf in policy_files:
            results.append(_check_yaml_file(pf, f"policy:{pf.relative_to(root)}"))
    else:
        results.append(DiagnosticResult("policy_packs", "warn", "No *.policy.yaml files found"))

    # 5. Python dependencies
    for mod in ["yaml", "jsonschema"]:
        results.append(_check_importable(mod))

    # 6. datalex_core importable
    results.append(_check_importable("datalex_core"))

    # 7. CLI entry point
    cli_path = root / "datalex"
    if cli_path.exists():
        results.append(DiagnosticResult("cli_entrypoint", "ok", str(cli_path)))
        if os.access(str(cli_path), os.X_OK):
            results.append(DiagnosticResult("cli_executable", "ok"))
        else:
            results.append(DiagnosticResult("cli_executable", "warn", "datalex is not executable (chmod +x datalex)"))
    else:
        results.append(DiagnosticResult("cli_entrypoint", "warn", "datalex script not found at project root"))

    # 8. requirements.txt
    req_path = root / "requirements.txt"
    results.append(_check_file_exists(req_path, "requirements_txt"))

    return results
140
+
141
+
142
def format_diagnostics(results: List[DiagnosticResult]) -> str:
    """Render diagnostic results as a plain-text report.

    The report has a title and rule, one line per check, a blank line, a
    summary of counts per status, and an overall status line.
    """
    icons = {"ok": "\u2713", "warn": "!", "error": "\u2717"}
    tally = {"ok": 0, "warn": 0, "error": 0}
    rows: List[str] = []

    for check in results:
        if check.status in tally:
            tally[check.status] += 1
        row = f" [{icons.get(check.status, '?')}] {check.name}"
        rows.append(f"{row} — {check.message}" if check.message else row)

    oks, warns, errors = tally["ok"], tally["warn"], tally["error"]
    if errors > 0:
        verdict = "Status: UNHEALTHY"
    elif warns > 0:
        verdict = "Status: OK (with warnings)"
    else:
        verdict = "Status: HEALTHY"

    report = ["DataLex Doctor", "=" * 40]
    report.extend(rows)
    report.append("")
    report.append(f"Summary: {oks} ok, {warns} warnings, {errors} errors")
    report.append(verdict)
    return "\n".join(report)
170
+
171
+
172
def diagnostics_as_json(results: List[DiagnosticResult]) -> Dict[str, Any]:
    """Build a JSON-serializable summary of diagnostic results.

    The returned dict has ``checks`` (each result's ``to_dict()``),
    ``summary`` (counts for "ok", "warn" and "error") and ``healthy``
    (True when there are no errors).
    """
    statuses = [check.status for check in results]
    summary = {level: statuses.count(level) for level in ("ok", "warn", "error")}
    payload: Dict[str, Any] = {}
    payload["checks"] = [check.to_dict() for check in results]
    payload["summary"] = summary
    payload["healthy"] = summary["error"] == 0
    return payload
datalex_core/generators.py ADDED
@@ -0,0 +1,478 @@
1
+ import re
2
+ from pathlib import Path
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ from datalex_core.modeling import normalize_model
6
+
7
# SQL dialects accepted by generate_sql_ddl; any other value makes it raise ValueError.
SUPPORTED_DIALECTS = {"postgres", "snowflake", "bigquery", "databricks"}
8
+
9
+
10
+ def _to_snake(name: str) -> str:
11
+ out: List[str] = []
12
+ for idx, char in enumerate(name):
13
+ if char.isupper() and idx > 0 and (not name[idx - 1].isupper()):
14
+ out.append("_")
15
+ out.append(char.lower())
16
+ return "".join(out)
17
+
18
+
19
+ def _sql_type(field_type: str, dialect: str) -> str:
20
+ value = field_type.strip().lower()
21
+ if value.startswith("decimal"):
22
+ return value.upper()
23
+
24
+ mapping_postgres = {
25
+ "string": "TEXT",
26
+ "integer": "INTEGER",
27
+ "bigint": "BIGINT",
28
+ "boolean": "BOOLEAN",
29
+ "date": "DATE",
30
+ "timestamp": "TIMESTAMP",
31
+ "float": "DOUBLE PRECISION",
32
+ "json": "JSONB",
33
+ "uuid": "UUID",
34
+ "text": "TEXT",
35
+ "binary": "BYTEA",
36
+ }
37
+ mapping_snowflake = {
38
+ "string": "VARCHAR",
39
+ "integer": "NUMBER",
40
+ "bigint": "NUMBER",
41
+ "boolean": "BOOLEAN",
42
+ "date": "DATE",
43
+ "timestamp": "TIMESTAMP_NTZ",
44
+ "float": "FLOAT",
45
+ "json": "VARIANT",
46
+ "uuid": "VARCHAR",
47
+ "text": "VARCHAR",
48
+ "binary": "BINARY",
49
+ }
50
+ mapping_bigquery = {
51
+ "string": "STRING",
52
+ "integer": "INT64",
53
+ "bigint": "INT64",
54
+ "boolean": "BOOL",
55
+ "date": "DATE",
56
+ "timestamp": "TIMESTAMP",
57
+ "float": "FLOAT64",
58
+ "json": "JSON",
59
+ "uuid": "STRING",
60
+ "text": "STRING",
61
+ "binary": "BYTES",
62
+ }
63
+ mapping_databricks = {
64
+ "string": "STRING",
65
+ "integer": "INT",
66
+ "bigint": "BIGINT",
67
+ "boolean": "BOOLEAN",
68
+ "date": "DATE",
69
+ "timestamp": "TIMESTAMP",
70
+ "float": "DOUBLE",
71
+ "json": "STRING",
72
+ "uuid": "STRING",
73
+ "text": "STRING",
74
+ "binary": "BINARY",
75
+ }
76
+
77
+ mappings = {
78
+ "postgres": mapping_postgres,
79
+ "snowflake": mapping_snowflake,
80
+ "bigquery": mapping_bigquery,
81
+ "databricks": mapping_databricks,
82
+ }
83
+ mapping = mappings.get(dialect, mapping_postgres)
84
+ return mapping.get(value, field_type)
85
+
86
+
87
+ def _qualified_name(entity: Dict[str, Any], dialect: str) -> str:
88
+ physical_name = entity.get("physical_name") or entity.get("physicalName")
89
+ inferred_physical = None
90
+ if not physical_name:
91
+ # Backward-compatible fallback: older connector pulls didn't store physical_name.
92
+ # Try to recover the warehouse identifier from the standard "Pulled from ..." description.
93
+ desc = str(entity.get("description") or "")
94
+ m = re.search(r"Pulled from Snowflake [^\s.]+\.[^\s.]+\.([^\s]+) on ", desc)
95
+ if m:
96
+ inferred_physical = m.group(1)
97
+
98
+ table_name = (
99
+ str(physical_name or inferred_physical).strip()
100
+ if (physical_name or inferred_physical)
101
+ else _to_snake(str(entity.get("name", "")))
102
+ )
103
+
104
+ schema_name = entity.get("schema")
105
+ database_name = entity.get("database")
106
+
107
+ # Snowflake treats quoted identifiers as case-sensitive; prefer uppercase identifiers by default
108
+ # so generated DDL matches warehouse naming conventions when physical_name isn't provided.
109
+ if dialect == "snowflake" and not (physical_name or inferred_physical):
110
+ table_name = table_name.upper()
111
+
112
+ if dialect == "bigquery":
113
+ parts = [p for p in [database_name, schema_name, table_name] if p]
114
+ return ".".join([f"`{p}`" for p in parts])
115
+
116
+ if database_name and schema_name:
117
+ return f'"{database_name}"."{schema_name}"."{table_name}"'
118
+ if schema_name:
119
+ return f'"{schema_name}"."{table_name}"'
120
+ return f'"{table_name}"'
121
+
122
+
123
+ def _format_default(value: Any, dialect: str) -> Optional[str]:
124
+ if value is None:
125
+ return "NULL"
126
+ if isinstance(value, bool):
127
+ return "TRUE" if value else "FALSE"
128
+ if isinstance(value, (int, float)):
129
+ return str(value)
130
+ return f"'{value}'"
131
+
132
+
133
def generate_sql_ddl(model: Dict[str, Any], dialect: str = "postgres") -> str:
    """Generate a SQL DDL script for *model* in the given dialect.

    The model is first passed through ``normalize_model``. The script
    contains CREATE statements for entities, then ALTER TABLE statements
    adding foreign keys for relationships, then CREATE INDEX statements,
    separated by blank lines.

    Args:
        model: Model dict; ``entities``, ``relationships`` and ``indexes``
            are read from the normalized model.
        dialect: Target dialect, matched case-insensitively against
            ``SUPPORTED_DIALECTS``.

    Returns:
        The DDL script, ending with a newline, or an empty string when
        nothing was generated.

    Raises:
        ValueError: If *dialect* is not in ``SUPPORTED_DIALECTS``.
    """
    # NOTE(review): runs of spaces inside the string literals below may have
    # been collapsed in the view this was taken from; verify the indentation
    # of generated column/constraint lines against the packaged file.
    model = normalize_model(model)
    dialect = dialect.lower()
    if dialect not in SUPPORTED_DIALECTS:
        raise ValueError(f"Unsupported SQL dialect. Use one of: {', '.join(sorted(SUPPORTED_DIALECTS))}.")

    entities = model.get("entities", [])
    relationships = model.get("relationships", [])
    indexes = model.get("indexes", [])

    create_blocks: List[str] = []
    alter_blocks: List[str] = []
    index_blocks: List[str] = []

    # Lookup used to resolve relationship and index targets by entity name.
    entity_map = {str(e.get("name", "")): e for e in entities}

    for entity in entities:
        entity_type = entity.get("type", "table")
        # Conceptual/logical entities are emitted as ordinary tables.
        if entity_type in {"concept", "logical_entity"}:
            entity_type = "table"
        entity_name = str(entity.get("name", ""))
        qualified = _qualified_name(entity, dialect)
        fields = entity.get("fields", [])

        # Views get a placeholder body that selects NULL for every field.
        if entity_type in ("view", "materialized_view"):
            keyword = "MATERIALIZED VIEW" if entity_type == "materialized_view" else "VIEW"
            col_list = ", ".join([f'NULL AS "{f.get("name")}"' for f in fields])
            create_blocks.append(f"CREATE {keyword} {qualified} AS\nSELECT {col_list};")
            continue

        # No DDL is generated for external tables.
        if entity_type == "external_table":
            continue

        # No DDL is generated for snapshots.
        if entity_type == "snapshot":
            continue

        # Build dimensional comment header for fact/dim/bridge tables
        # (and for Data Vault hub/link/satellite entities).
        dim_header: Optional[str] = None
        if entity_type == "fact_table":
            grain = entity.get("grain", [])
            grain_str = ", ".join(grain) if grain else "not declared"
            dim_refs = entity.get("dimension_refs", [])
            dims_str = ", ".join(dim_refs) if dim_refs else "none declared"
            dim_header = (
                f"-- Fact table: {entity_name}\n"
                f"-- Grain: {grain_str}\n"
                f"-- Dimension references: {dims_str}"
            )
        elif entity_type == "dimension_table":
            scd_type = entity.get("scd_type")
            natural_key = entity.get("natural_key") or "not declared"
            conformed = entity.get("conformed", False)
            scd_str = f"SCD Type {scd_type}" if scd_type else "SCD Type 1 (default)"
            dim_header = (
                f"-- Dimension table: {entity_name}\n"
                f"-- Natural key: {natural_key}\n"
                f"-- {scd_str}"
                + ("\n-- CONFORMED: shared across multiple fact tables" if conformed else "")
            )
        elif entity_type == "bridge_table":
            dim_header = f"-- Bridge table: {entity_name} (many-to-many resolution)"
        elif entity_type == "hub":
            business_keys = entity.get("business_keys", [])
            # Each key set is joined with "/" and the sets with ", ".
            business_key_str = ", ".join("/".join(keyset) for keyset in business_keys) if business_keys else "not declared"
            dim_header = (
                f"-- Data Vault Hub: {entity_name}\n"
                f"-- Business keys: {business_key_str}\n"
                f"-- Hash key: {entity.get('hash_key') or 'not declared'}"
            )
        elif entity_type == "link":
            link_refs = entity.get("link_refs", [])
            link_ref_str = ", ".join(link_refs) if link_refs else "not declared"
            dim_header = (
                f"-- Data Vault Link: {entity_name}\n"
                f"-- References: {link_ref_str}\n"
                f"-- Hash key: {entity.get('hash_key') or 'not declared'}"
            )
        elif entity_type == "satellite":
            hash_diff = entity.get("hash_diff_fields", [])
            hash_diff_str = ", ".join(hash_diff) if hash_diff else "not declared"
            dim_header = (
                f"-- Data Vault Satellite: {entity_name}\n"
                f"-- Parent: {entity.get('parent_entity') or 'not declared'}\n"
                f"-- Hash diff fields: {hash_diff_str}"
            )

        column_lines: List[str] = []
        pk_fields: List[str] = []
        check_constraints: List[str] = []

        for field in fields:
            # Computed fields are not emitted as columns.
            if field.get("computed") is True:
                continue

            field_name = str(field.get("name", ""))
            col_type = _sql_type(str(field.get("type", "string")), dialect)
            nullable = bool(field.get("nullable", True))
            unique = bool(field.get("unique", False))
            primary_key = bool(field.get("primary_key", False))

            # NOTE(review): column names are double-quoted for every dialect,
            # including bigquery, whose table names are backtick-quoted by
            # _qualified_name - confirm this is intended.
            parts = [f'"{field_name}"', col_type]

            default_val = field.get("default")
            # Key presence (not truthiness) decides whether a DEFAULT clause
            # is emitted, so an explicit None default is still formatted.
            if "default" in field:
                formatted = _format_default(default_val, dialect)
                if formatted is not None:
                    parts.append(f"DEFAULT {formatted}")

            if not nullable:
                parts.append("NOT NULL")
            if unique:
                parts.append("UNIQUE")
            # Primary keys are collected and emitted as one table-level
            # PRIMARY KEY clause after the columns.
            if primary_key:
                pk_fields.append(field_name)

            column_lines.append(" " + " ".join(parts))

            check_expr = field.get("check")
            if check_expr:
                constraint_name = f"chk_{_to_snake(entity_name)}_{field_name}"
                check_constraints.append(
                    f' CONSTRAINT "{constraint_name}" CHECK ({check_expr})'
                )

        if pk_fields:
            pk_cols = ", ".join([f'"{col}"' for col in pk_fields])
            column_lines.append(f" PRIMARY KEY ({pk_cols})")

        # CHECK constraints follow the columns and the primary key.
        column_lines.extend(check_constraints)

        create_sql = f"CREATE TABLE {qualified} (\n" + ",\n".join(column_lines) + "\n);"
        if dim_header:
            create_sql = dim_header + "\n" + create_sql
        create_blocks.append(create_sql)

    for rel in relationships:
        from_ref = str(rel.get("from", ""))
        to_ref = str(rel.get("to", ""))
        cardinality = str(rel.get("cardinality", "one_to_many"))
        rel_name = str(rel.get("name", "relationship"))

        # Both ends must be written as "Entity.field".
        if "." not in from_ref or "." not in to_ref:
            continue

        from_entity, from_field = from_ref.split(".", 1)
        to_entity, to_field = to_ref.split(".", 1)

        # Decide which side carries the foreign key (the child).
        if cardinality == "one_to_many":
            parent_entity, parent_field = from_entity, from_field
            child_entity, child_field = to_entity, to_field
        elif cardinality == "many_to_one":
            parent_entity, parent_field = to_entity, to_field
            child_entity, child_field = from_entity, from_field
        elif cardinality == "one_to_one":
            parent_entity, parent_field = from_entity, from_field
            child_entity, child_field = to_entity, to_field
        else:
            # Any other cardinality produces no foreign key.
            continue

        constraint = f"fk_{_to_snake(rel_name)}"
        # Entities missing from the model are qualified from their name alone.
        child_qualified = _qualified_name(entity_map.get(child_entity, {"name": child_entity}), dialect)
        parent_qualified = _qualified_name(entity_map.get(parent_entity, {"name": parent_entity}), dialect)

        # No foreign-key statements are emitted for bigquery.
        if dialect == "bigquery":
            continue

        alter_sql = (
            f"ALTER TABLE {child_qualified} "
            f'ADD CONSTRAINT "{constraint}" FOREIGN KEY ("{child_field}") '
            f'REFERENCES {parent_qualified} ("{parent_field}");'
        )
        alter_blocks.append(alter_sql)

    for idx_def in indexes:
        idx_name = idx_def.get("name", "")
        idx_entity = idx_def.get("entity", "")
        idx_fields = idx_def.get("fields", [])
        idx_unique = idx_def.get("unique", False)

        entity_obj = entity_map.get(idx_entity, {"name": idx_entity})
        qualified = _qualified_name(entity_obj, dialect)
        cols = ", ".join([f'"{f}"' for f in idx_fields])
        unique_kw = "UNIQUE " if idx_unique else ""

        # No index statements are emitted for bigquery.
        if dialect == "bigquery":
            continue

        index_blocks.append(
            f'CREATE {unique_kw}INDEX "{idx_name}" ON {qualified} ({cols});'
        )

    # Tables/views first, then foreign keys, then indexes.
    blocks = create_blocks + alter_blocks + index_blocks
    return "\n\n".join(blocks) + ("\n" if blocks else "")
326
+
327
+
328
def _dbt_source_table_name(entity_name: str) -> str:
    """Return the dbt source table name for *entity_name*: its snake_case form."""
    snake_cased = _to_snake(entity_name)
    return snake_cased
330
+
331
+
332
def dbt_scaffold_files(
    model: Dict[str, Any],
    source_name: str = "raw",
    project_name: str = "data_modeling_mvp",
) -> List[Tuple[str, str]]:
    """Build the contents of a dbt project scaffold for *model*.

    Nothing is written to disk; the files are returned as
    ``(relative_path, content)`` pairs in this order: ``dbt_project.yml``,
    one ``models/staging/<model>.sql`` per entity,
    ``models/staging/schema.yml`` and ``models/sources.yml``.

    Args:
        model: Model dict; only its ``entities`` list is read, and the
            model is used as given (it is not normalized here).
        source_name: dbt source name used in ``source()`` calls and in
            ``sources.yml``.
        project_name: Name written into ``dbt_project.yml``.

    Returns:
        List of ``(relative_path, content)`` tuples.
    """
    # NOTE(review): the YAML/SQL below is assembled from string literals whose
    # leading indentation may have been collapsed in the view this was taken
    # from; verify the literals against the packaged file before relying on
    # the exact layout of the generated files.
    # NOTE(review): values are interpolated into double-quoted YAML scalars
    # without escaping; a description containing a double quote would likely
    # produce invalid YAML - confirm and escape if so.
    entities = model.get("entities", [])

    files: List[Tuple[str, str]] = []
    dbt_project = (
        f"name: {project_name}\n"
        "version: 1.0.0\n"
        "config-version: 2\n\n"
        "profile: default\n\n"
        "models:\n"
        f" {project_name}:\n"
        " staging:\n"
        " +materialized: view\n"
    )
    files.append(("dbt_project.yml", dbt_project))

    schema_lines = ["version: 2", "", "models:"]

    for entity in entities:
        entity_name = str(entity.get("name", ""))
        entity_type = str(entity.get("type", "table"))
        table_name = _dbt_source_table_name(entity_name)
        # Use dimensional naming conventions for fact/dim/bridge tables
        # (and Data Vault prefixes for hub/link/satellite); everything else
        # is a staging model.
        if entity_type == "fact_table":
            model_name = f"fct_{table_name}"
        elif entity_type == "dimension_table":
            model_name = f"dim_{table_name}"
        elif entity_type == "bridge_table":
            model_name = f"brd_{table_name}"
        elif entity_type == "hub":
            model_name = f"hub_{table_name}"
        elif entity_type == "link":
            model_name = f"lnk_{table_name}"
        elif entity_type == "satellite":
            model_name = f"sat_{table_name}"
        else:
            model_name = f"stg_{table_name}"
        fields = entity.get("fields", [])

        # Model SQL: select every field from the matching source table.
        # The quadruple braces render as a literal "{{ ... }}" Jinja call.
        sql = (
            f"select\n "
            + ",\n ".join([f'"{field.get("name")}"' for field in fields])
            + f"\nfrom {{{{ source('{source_name}', '{table_name}') }}}}\n"
        )
        # All model files go under models/staging regardless of prefix.
        files.append((f"models/staging/{model_name}.sql", sql))

        schema_lines.append(f" - name: {model_name}")
        if entity.get("description"):
            schema_lines.append(f" description: \"{entity.get('description')}\"")
        entity_meta: List[str] = []
        # List-valued entries (tags, refs, keys) are written using Python's
        # list formatting.
        if entity.get("tags"):
            entity_meta.append(f" tags: {entity['tags']}")
        if entity.get("owner"):
            entity_meta.append(f" owner: \"{entity['owner']}\"")
        if entity.get("subject_area"):
            entity_meta.append(f" subject_area: \"{entity['subject_area']}\"")
        # Dimensional modeling metadata in dbt meta block
        if entity_type in {"fact_table", "dimension_table", "bridge_table", "hub", "link", "satellite"}:
            entity_meta.append(f" entity_type: \"{entity_type}\"")
        if entity.get("scd_type"):
            entity_meta.append(f" scd_type: {entity['scd_type']}")
        if entity.get("natural_key"):
            entity_meta.append(f" natural_key: \"{entity['natural_key']}\"")
        if entity.get("conformed"):
            entity_meta.append(" conformed: true")
        if entity.get("dimension_refs"):
            entity_meta.append(f" dimension_refs: {entity['dimension_refs']}")
        if entity.get("business_keys"):
            entity_meta.append(f" business_keys: {entity['business_keys']}")
        if entity.get("hash_key"):
            entity_meta.append(f" hash_key: \"{entity['hash_key']}\"")
        if entity.get("link_refs"):
            entity_meta.append(f" link_refs: {entity['link_refs']}")
        if entity.get("parent_entity"):
            entity_meta.append(f" parent_entity: \"{entity['parent_entity']}\"")
        if entity.get("hash_diff_fields"):
            entity_meta.append(f" hash_diff_fields: {entity['hash_diff_fields']}")
        # The meta block is only written when there is something to put in it.
        if entity_meta:
            schema_lines.append(" meta:")
            schema_lines.extend(entity_meta)
        schema_lines.append(" columns:")
        for field in fields:
            field_name = str(field.get("name", ""))
            schema_lines.append(f" - name: {field_name}")
            # Fall back to a generated description when none is given.
            description = str(field.get("description", "")).strip() or f"Field {field_name}"
            schema_lines.append(f" description: \"{description}\"")
            field_meta: List[str] = []
            if field.get("sensitivity"):
                field_meta.append(f" sensitivity: \"{field['sensitivity']}\"")
            if field.get("tags"):
                field_meta.append(f" tags: {field['tags']}")
            if field.get("deprecated"):
                field_meta.append(" deprecated: true")
            if field_meta:
                schema_lines.append(" meta:")
                schema_lines.extend(field_meta)
            # Primary keys get not_null + unique; otherwise only explicitly
            # non-nullable fields get not_null.
            tests: List[str] = []
            if field.get("primary_key"):
                tests.extend(["not_null", "unique"])
            elif field.get("nullable") is False:
                tests.append("not_null")
            if tests:
                schema_lines.append(" tests:")
                for test_name in tests:
                    schema_lines.append(f" - {test_name}")

    files.append(("models/staging/schema.yml", "\n".join(schema_lines) + "\n"))

    # Sources file: one table entry per entity, all under schema "public".
    source_schema = [
        "version: 2",
        "",
        "sources:",
        f" - name: {source_name}",
        " schema: public",
        " tables:",
    ]
    for entity in entities:
        table_name = _dbt_source_table_name(str(entity.get("name", "")))
        source_schema.append(f" - name: {table_name}")
    files.append(("models/sources.yml", "\n".join(source_schema) + "\n"))

    return files
458
+
459
+
460
def write_dbt_scaffold(
    model: Dict[str, Any],
    out_dir: str,
    source_name: str = "raw",
    project_name: str = "data_modeling_mvp",
) -> List[str]:
    """Write the dbt scaffold for *model* beneath *out_dir*.

    The output directory and any intermediate directories are created as
    needed, and each file is written as UTF-8 text.

    Returns:
        The paths of the written files, as strings, in the order produced
        by ``dbt_scaffold_files``.
    """
    base_dir = Path(out_dir)
    base_dir.mkdir(parents=True, exist_ok=True)

    scaffold = dbt_scaffold_files(
        model=model, source_name=source_name, project_name=project_name
    )

    written: List[str] = []
    for relative, text in scaffold:
        destination = base_dir / relative
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(text, encoding="utf-8")
        written.append(str(destination))
    return written