datalex-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. datalex_cli/__init__.py +1 -0
  2. datalex_cli/datalex_cli.py +658 -0
  3. datalex_cli/main.py +2925 -0
  4. datalex_cli-0.1.1.dist-info/METADATA +228 -0
  5. datalex_cli-0.1.1.dist-info/RECORD +64 -0
  6. datalex_cli-0.1.1.dist-info/WHEEL +5 -0
  7. datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
  8. datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
  9. datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
  10. datalex_core/__init__.py +94 -0
  11. datalex_core/_schemas/datalex/common.schema.json +127 -0
  12. datalex_core/_schemas/datalex/domain.schema.json +24 -0
  13. datalex_core/_schemas/datalex/entity.schema.json +158 -0
  14. datalex_core/_schemas/datalex/model.schema.json +141 -0
  15. datalex_core/_schemas/datalex/policy.schema.json +70 -0
  16. datalex_core/_schemas/datalex/project.schema.json +82 -0
  17. datalex_core/_schemas/datalex/snippet.schema.json +24 -0
  18. datalex_core/_schemas/datalex/source.schema.json +104 -0
  19. datalex_core/_schemas/datalex/term.schema.json +30 -0
  20. datalex_core/canonical.py +166 -0
  21. datalex_core/completion.py +204 -0
  22. datalex_core/connectors/__init__.py +39 -0
  23. datalex_core/connectors/base.py +417 -0
  24. datalex_core/connectors/bigquery.py +229 -0
  25. datalex_core/connectors/databricks.py +262 -0
  26. datalex_core/connectors/mysql.py +266 -0
  27. datalex_core/connectors/postgres.py +309 -0
  28. datalex_core/connectors/redshift.py +298 -0
  29. datalex_core/connectors/snowflake.py +336 -0
  30. datalex_core/connectors/sqlserver.py +425 -0
  31. datalex_core/datalex/__init__.py +26 -0
  32. datalex_core/datalex/diff.py +188 -0
  33. datalex_core/datalex/errors.py +85 -0
  34. datalex_core/datalex/loader.py +512 -0
  35. datalex_core/datalex/migrate_layout.py +382 -0
  36. datalex_core/datalex/parse_cache.py +102 -0
  37. datalex_core/datalex/project.py +214 -0
  38. datalex_core/datalex/types.py +224 -0
  39. datalex_core/dbt/__init__.py +18 -0
  40. datalex_core/dbt/emit.py +344 -0
  41. datalex_core/dbt/manifest.py +329 -0
  42. datalex_core/dbt/profiles.py +185 -0
  43. datalex_core/dbt/sync.py +279 -0
  44. datalex_core/dbt/warehouse.py +215 -0
  45. datalex_core/dialects/__init__.py +15 -0
  46. datalex_core/dialects/_common.py +48 -0
  47. datalex_core/dialects/base.py +47 -0
  48. datalex_core/dialects/postgres.py +164 -0
  49. datalex_core/dialects/registry.py +36 -0
  50. datalex_core/dialects/snowflake.py +129 -0
  51. datalex_core/diffing.py +358 -0
  52. datalex_core/docs_generator.py +797 -0
  53. datalex_core/doctor.py +181 -0
  54. datalex_core/generators.py +478 -0
  55. datalex_core/importers.py +1176 -0
  56. datalex_core/issues.py +23 -0
  57. datalex_core/loader.py +21 -0
  58. datalex_core/migrate.py +316 -0
  59. datalex_core/modeling.py +679 -0
  60. datalex_core/packages.py +430 -0
  61. datalex_core/policy.py +1037 -0
  62. datalex_core/resolver.py +456 -0
  63. datalex_core/schema.py +54 -0
  64. datalex_core/semantic.py +1561 -0
@@ -0,0 +1,1176 @@
1
+ import json
2
+ import re
3
+ from datetime import date
4
+ from typing import Any, Dict, List, Optional, Tuple
5
+
6
+ import yaml
7
+
8
+
9
# Matches ``CREATE TABLE [IF NOT EXISTS] <name> ( <body> );``.
# Group 1 is the (possibly quoted / dotted) table name, group 2 the column and
# constraint list.  The body is matched lazily up to the first ")" that is
# followed by optional whitespace and ";", so the statement must end with ";".
CREATE_TABLE_RE = re.compile(
    r"create\s+table\s+(?:if\s+not\s+exists\s+)?([\w\"\.\.]+)\s*\((.*?)\)\s*;",
    flags=re.IGNORECASE | re.DOTALL,
)
# Matches ``CREATE [OR REPLACE] VIEW [IF NOT EXISTS] <name>``; group 1 is the name.
CREATE_VIEW_RE = re.compile(
    r"create\s+(?:or\s+replace\s+)?view\s+(?:if\s+not\s+exists\s+)?([\w\"\.\.]+)",
    flags=re.IGNORECASE,
)
# Matches ``CREATE [OR REPLACE] MATERIALIZED VIEW [IF NOT EXISTS] <name>``;
# group 1 is the name.
CREATE_MVIEW_RE = re.compile(
    r"create\s+(?:or\s+replace\s+)?materialized\s+view\s+(?:if\s+not\s+exists\s+)?([\w\"\.\.]+)",
    flags=re.IGNORECASE,
)
# Matches ``CREATE [UNIQUE] INDEX [IF NOT EXISTS] <index> ON <table> (<cols>)``.
# Groups: 1 = index name, 2 = table name, 3 = comma-separated column list.
CREATE_INDEX_RE = re.compile(
    r"create\s+(?:unique\s+)?index\s+(?:if\s+not\s+exists\s+)?([\w\"]+)\s+on\s+([\w\"\.\.]+)\s*\(([^)]+)\)",
    flags=re.IGNORECASE,
)
# DBML table header on a line of its own: ``Table <name> {``; group 1 is the name.
TABLE_RE = re.compile(r"^\s*table\s+([\w\"]+)\s*\{\s*$", flags=re.IGNORECASE)
# DBML reference line: ``Ref: <table>.<field> <op> <table>.<field>`` where <op>
# is any run of "<", ">" and "-".  Groups: 1-2 = left side, 3 = operator,
# 4-5 = right side.
REF_RE = re.compile(r"^\s*ref\s*:\s*([\w]+)\.([\w]+)\s*([<>-]+)\s*([\w]+)\.([\w]+)", flags=re.IGNORECASE)
# Single-argument dbt ``ref('<model>')`` call; group 1 is the model name.
DBT_REF_RE = re.compile(r"ref\(\s*['\"]([^'\"]+)['\"]\s*\)", flags=re.IGNORECASE)
# dbt ``source('<source>', '<table>')`` call; group 1 = source, group 2 = table.
DBT_SOURCE_RE = re.compile(
    r"source\(\s*['\"]([^'\"]+)['\"]\s*,\s*['\"]([^'\"]+)['\"]\s*\)",
    flags=re.IGNORECASE,
)
# SQL-style ``REFERENCES <table> (<column>)`` fragment; group 1 = table,
# group 2 = column.
DBT_SQL_REF_RE = re.compile(
    r"references\s+([A-Za-z0-9_\"\.]+)\s*\(\s*([A-Za-z0-9_\"]+)\s*\)",
    flags=re.IGNORECASE,
)
36
+
37
+
38
+ def _to_pascal(name: str) -> str:
39
+ name = name.replace('"', "")
40
+ parts = re.split(r"[^A-Za-z0-9]+", name)
41
+ return "".join(part[:1].upper() + part[1:] for part in parts if part)
42
+
43
+
44
+ def _to_model_name(text: str) -> str:
45
+ cleaned = re.sub(r"[^a-zA-Z0-9]+", "_", text).strip("_")
46
+ cleaned = cleaned.lower()
47
+ return cleaned or "imported_model"
48
+
49
+
50
+ def _to_snake(name: str) -> str:
51
+ text = re.sub(r"[^A-Za-z0-9]+", "_", str(name or "").strip())
52
+ text = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", text)
53
+ text = re.sub(r"__+", "_", text).strip("_").lower()
54
+ if not text:
55
+ return ""
56
+ if text[0].isdigit():
57
+ text = f"f_{text}"
58
+ return text
59
+
60
+
61
+ def _split_top_level(body: str) -> List[str]:
62
+ parts: List[str] = []
63
+ current: List[str] = []
64
+ depth = 0
65
+ in_single = False
66
+ in_double = False
67
+
68
+ for char in body:
69
+ if char == "'" and not in_double:
70
+ in_single = not in_single
71
+ elif char == '"' and not in_single:
72
+ in_double = not in_double
73
+ elif not in_single and not in_double:
74
+ if char == "(":
75
+ depth += 1
76
+ elif char == ")":
77
+ depth = max(0, depth - 1)
78
+ elif char == "," and depth == 0:
79
+ parts.append("".join(current).strip())
80
+ current = []
81
+ continue
82
+ current.append(char)
83
+
84
+ if current:
85
+ parts.append("".join(current).strip())
86
+ return [part for part in parts if part]
87
+
88
+
89
def _default_model(model_name: str, domain: str, owners: List[str]) -> Dict[str, Any]:
    """Return the empty model skeleton that every importer starts from.

    The ``model`` header carries the normalized name (via ``_to_model_name``),
    the given domain and owners, and fixed values for kind, spec version,
    version and state.  Entities, relationships and rules start empty.
    """
    header: Dict[str, Any] = {
        "name": _to_model_name(model_name),
        "kind": "physical",
        "spec_version": 3,
        "version": "1.0.0",
        "domain": domain,
        "owners": owners,
        "state": "draft",
    }
    skeleton: Dict[str, Any] = {"model": header}
    skeleton["entities"] = []
    skeleton["relationships"] = []
    skeleton["governance"] = {"classification": {}, "stewards": {}}
    skeleton["rules"] = []
    return skeleton
105
+
106
+
107
+ def _parse_default_value(rest: str) -> Optional[str]:
108
+ """Extract DEFAULT value from column definition tail."""
109
+ m = re.search(r"default\s+('(?:[^']*)'|\S+)", rest, re.IGNORECASE)
110
+ if m:
111
+ val = m.group(1).strip("'")
112
+ return val
113
+ return None
114
+
115
+
116
+ def _parse_check_constraint(rest: str) -> Optional[str]:
117
+ """Extract CHECK constraint expression from column definition tail."""
118
+ m = re.search(r"check\s*\((.+?)\)", rest, re.IGNORECASE)
119
+ if m:
120
+ return m.group(1).strip()
121
+ return None
122
+
123
+
124
def import_sql_ddl(
    ddl_text: str,
    model_name: str = "imported_sql_model",
    domain: str = "imported",
    owners: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Build a model dict from SQL DDL text.

    Recognizes ``CREATE TABLE`` (columns, primary keys, foreign keys),
    ``CREATE VIEW`` / ``CREATE MATERIALIZED VIEW`` (name only) and
    ``CREATE INDEX`` statements using the module-level regular expressions.

    Args:
        ddl_text: Raw DDL script.
        model_name: Name for the resulting model.
        domain: Domain recorded in the model header.
        owners: Owner list; defaults to ``["data-team@example.com"]``.

    Returns:
        The model dict with ``entities`` sorted by name, de-duplicated
        ``relationships`` sorted by name, and an ``indexes`` list when any
        index statement was found.

    Notes:
        * A composite ``FOREIGN KEY (a, b)`` is emitted as one relationship
          whose field part is the raw comma-joined column list.
        * Table-level ``UNIQUE (...)``, ``CHECK (...)`` and any other named
          constraint that is not a primary/foreign key are skipped.
    """
    owners = owners or ["data-team@example.com"]
    model = _default_model(model_name=model_name, domain=domain, owners=owners)

    entity_fields: Dict[str, List[Dict[str, Any]]] = {}
    entity_meta: Dict[str, Dict[str, Any]] = {}
    primary_keys: Dict[str, List[str]] = {}
    relationships: List[Dict[str, Any]] = []
    indexes: List[Dict[str, Any]] = []

    # --- Parse CREATE TABLE ---
    for match in CREATE_TABLE_RE.finditer(ddl_text):
        table_token = match.group(1).strip()
        schema_name = ""
        parts = table_token.replace('"', "").split(".")
        if len(parts) >= 2:
            schema_name = parts[-2]
        table_raw = parts[-1]
        entity_name = _to_pascal(table_raw)
        entity_fields.setdefault(entity_name, [])
        primary_keys.setdefault(entity_name, [])
        if schema_name:
            entity_meta.setdefault(entity_name, {})["schema"] = schema_name

        body = match.group(2)
        for definition in _split_top_level(body):
            # Strip an optional "CONSTRAINT <name>" prefix so named primary /
            # foreign keys are classified like their unnamed forms instead of
            # being mistaken for a column called "constraint".
            named = re.match(
                r'constraint\s+(?:"[^"]+"|\S+)\s+',
                definition,
                flags=re.IGNORECASE,
            )
            clause = definition[named.end():] if named else definition
            lowered = clause.lower()

            if re.match(r"primary\s+key\b", lowered):
                cols_match = re.search(r"\((.*?)\)", clause)
                if cols_match:
                    cols = [col.strip().replace('"', "") for col in cols_match.group(1).split(",")]
                    primary_keys[entity_name].extend(cols)
                continue

            if re.match(r"foreign\s+key\b", lowered):
                fk_match = re.search(
                    r"foreign\s+key\s*\((.*?)\)\s+references\s+([\w\"\.\.]+)\s*\((.*?)\)",
                    clause,
                    flags=re.IGNORECASE,
                )
                if fk_match:
                    local_field = fk_match.group(1).strip().replace('"', "")
                    ref_table = fk_match.group(2).strip().split(".")[-1].replace('"', "")
                    ref_field = fk_match.group(3).strip().replace('"', "")
                    parent_entity = _to_pascal(ref_table)
                    child_entity = entity_name
                    relationships.append(
                        {
                            "name": f"{parent_entity.lower()}_{child_entity.lower()}_{local_field}_fk",
                            "from": f"{parent_entity}.{ref_field}",
                            "to": f"{child_entity}.{local_field}",
                            "cardinality": "one_to_many",
                        }
                    )
                continue

            # Table-level CHECK / UNIQUE and any other named constraint carry
            # no column of their own.  The "(" is required so that columns
            # such as "checksum" or "unique_id" are not skipped.
            if named or re.match(r"(?:check|unique)\s*\(", lowered):
                continue

            # The type is a bare token optionally followed by a parenthesized
            # argument list, so "numeric(10, 2)" stays in one piece.  DOTALL
            # lets the tail span the lines of a multi-line definition.
            col_match = re.match(
                r"^\s*\"?([A-Za-z_][A-Za-z0-9_]*)\"?\s+([^\s,(]+(?:\([^)]*\))?)(.*)$",
                definition,
                flags=re.DOTALL,
            )
            if not col_match:
                continue

            col_name = col_match.group(1)
            col_type = col_match.group(2)
            rest = col_match.group(3)
            rest_lower = rest.lower()

            field: Dict[str, Any] = {
                "name": col_name,
                "type": col_type.lower(),
                "nullable": "not null" not in rest_lower,
            }
            if "primary key" in rest_lower:
                field["primary_key"] = True
                # Matches the table-level primary key handling further down,
                # which also forces nullable to False.
                field["nullable"] = False
            if "unique" in rest_lower:
                field["unique"] = True

            default_val = _parse_default_value(rest)
            if default_val is not None:
                field["default"] = default_val

            check_expr = _parse_check_constraint(rest)
            if check_expr:
                field["check"] = check_expr

            ref_match = re.search(
                r"references\s+([\w\"\.\.]+)\s*\((.*?)\)",
                rest,
                flags=re.IGNORECASE,
            )
            if ref_match:
                ref_table = ref_match.group(1).strip().split(".")[-1].replace('"', "")
                ref_field = ref_match.group(2).strip().replace('"', "")
                parent_entity = _to_pascal(ref_table)
                child_entity = entity_name
                field["foreign_key"] = True
                relationships.append(
                    {
                        "name": f"{parent_entity.lower()}_{child_entity.lower()}_{col_name}_fk",
                        "from": f"{parent_entity}.{ref_field}",
                        "to": f"{child_entity}.{col_name}",
                        "cardinality": "one_to_many",
                    }
                )

            entity_fields[entity_name].append(field)

    # --- Parse CREATE VIEW / CREATE MATERIALIZED VIEW ---
    for m in CREATE_MVIEW_RE.finditer(ddl_text):
        view_token = m.group(1).strip().replace('"', "").split(".")[-1]
        ename = _to_pascal(view_token)
        if ename not in entity_fields:
            entity_fields[ename] = []
            entity_meta.setdefault(ename, {})["type"] = "materialized_view"

    for m in CREATE_VIEW_RE.finditer(ddl_text):
        view_token = m.group(1).strip().replace('"', "").split(".")[-1]
        ename = _to_pascal(view_token)
        # Don't overwrite materialized_view
        if ename not in entity_fields:
            entity_fields[ename] = []
            entity_meta.setdefault(ename, {})["type"] = "view"

    # --- Parse CREATE INDEX ---
    for m in CREATE_INDEX_RE.finditer(ddl_text):
        idx_name = m.group(1).strip().replace('"', "")
        idx_table = m.group(2).strip().replace('"', "").split(".")[-1]
        idx_cols = [c.strip().replace('"', "") for c in m.group(3).split(",")]
        # Inspect only the matched statement: looking at text before the match
        # would pick up UNIQUE from a preceding CREATE UNIQUE INDEX statement.
        is_unique = bool(re.match(r"create\s+unique\s+index\b", m.group(0), re.IGNORECASE))
        idx_entity = _to_pascal(idx_table)
        indexes.append({
            "name": idx_name,
            "entity": idx_entity,
            "fields": idx_cols,
            "unique": is_unique,
        })

    # --- Build entities ---
    for entity_name, fields in sorted(entity_fields.items()):
        pk_set = {value for value in primary_keys.get(entity_name, []) if value}
        for field in fields:
            if field["name"] in pk_set:
                field["primary_key"] = True
                field["nullable"] = False

        meta = entity_meta.get(entity_name, {})
        entity: Dict[str, Any] = {
            "name": entity_name,
            "type": meta.get("type", "table"),
            "description": f"Imported from SQL on {date.today().isoformat()}",
            "fields": fields,
        }
        if meta.get("schema"):
            entity["schema"] = meta["schema"]
        model["entities"].append(entity)

    # Identical relationships (same name, endpoints and cardinality) collapse
    # to one entry.
    deduped: Dict[Tuple[str, str, str, str], Dict[str, str]] = {}
    for rel in relationships:
        key = (rel["name"], rel["from"], rel["to"], rel["cardinality"])
        deduped[key] = rel
    model["relationships"] = sorted(deduped.values(), key=lambda x: x["name"])

    if indexes:
        model["indexes"] = indexes

    return model
299
+
300
+
301
def import_dbml(
    dbml_text: str,
    model_name: str = "imported_dbml_model",
    domain: str = "imported",
    owners: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Build a model dict from DBML text.

    The text is processed line by line.  ``Table <name> {`` opens an entity,
    a line consisting of ``}`` closes it, ``Ref: a.x > b.y`` lines become
    relationships, and every other line inside a table that looks like
    ``<name> <type> [settings]`` becomes a field.

    Args:
        dbml_text: Raw DBML source.
        model_name: Name for the resulting model.
        domain: Domain recorded in the model header.
        owners: Owner list; defaults to ``["data-team@example.com"]``.

    Returns:
        The model dict with entities and de-duplicated relationships, both
        sorted by name.

    Notes:
        NOTE(review): every reference is emitted with cardinality
        ``one_to_many``, including ``-`` and ``<>`` operators; confirm the
        allowed cardinality values before mapping those differently.
    """
    owners = owners or ["data-team@example.com"]
    model = _default_model(model_name=model_name, domain=domain, owners=owners)

    entities: Dict[str, Dict[str, Any]] = {}
    current_entity: str = ""
    # Depth of nested "{ ... }" blocks (e.g. "indexes {") inside the current
    # table.  Their lines are not column definitions, and their closing "}"
    # must not close the table.
    nested_depth = 0

    for raw_line in dbml_text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("//"):
            continue

        if nested_depth:
            if line.endswith("{"):
                nested_depth += 1
            elif line == "}":
                nested_depth -= 1
            continue

        table_match = TABLE_RE.match(line)
        if table_match:
            table_name = table_match.group(1).replace('"', "")
            current_entity = _to_pascal(table_name)
            entities[current_entity] = {
                "name": current_entity,
                "type": "table",
                "description": f"Imported from DBML on {date.today().isoformat()}",
                "fields": [],
            }
            continue

        if line == "}":
            current_entity = ""
            continue

        ref_match = REF_RE.match(line)
        if ref_match:
            left_table = _to_pascal(ref_match.group(1))
            left_field = ref_match.group(2)
            direction = ref_match.group(3)
            right_table = _to_pascal(ref_match.group(4))
            right_field = ref_match.group(5)

            # With ">" the left side is the referencing (child) side.
            if ">" in direction:
                parent_table, parent_field = right_table, right_field
                child_table, child_field = left_table, left_field
            else:
                parent_table, parent_field = left_table, left_field
                child_table, child_field = right_table, right_field

            model["relationships"].append(
                {
                    "name": f"{parent_table.lower()}_{child_table.lower()}_{child_field}_fk",
                    "from": f"{parent_table}.{parent_field}",
                    "to": f"{child_table}.{child_field}",
                    "cardinality": "one_to_many",
                }
            )
            continue

        if not current_entity:
            continue

        if line.endswith("{"):
            # Start of a nested block inside the table; skip its contents.
            nested_depth = 1
            continue

        # Example: user_id integer [pk, not null, unique]
        field_match = re.match(
            r"^([A-Za-z_][A-Za-z0-9_]*)\s+([^\s\[]+)(?:\s*\[(.*?)\])?$",
            line,
        )
        if not field_match:
            continue

        field_name = field_match.group(1)
        field_type = field_match.group(2).lower()
        attrs = (field_match.group(3) or "").lower()
        # Compare whole comma-separated settings rather than substrings so
        # that text inside e.g. a note does not toggle a flag.
        settings = {" ".join(part.split()) for part in attrs.split(",")}

        field: Dict[str, Any] = {
            "name": field_name,
            "type": field_type,
            "nullable": "not null" not in settings,
        }
        if "pk" in settings or "primary key" in settings:
            field["primary_key"] = True
            field["nullable"] = False
        if "unique" in settings:
            field["unique"] = True
        entities[current_entity]["fields"].append(field)

    model["entities"] = sorted(entities.values(), key=lambda x: x["name"])

    deduped: Dict[Tuple[str, str, str, str], Dict[str, str]] = {}
    for rel in model["relationships"]:
        key = (rel["name"], rel["from"], rel["to"], rel["cardinality"])
        deduped[key] = rel
    model["relationships"] = sorted(deduped.values(), key=lambda x: x["name"])

    return model
393
+
394
+
395
+ # ---------------------------------------------------------------------------
396
+ # Spark schema importer (JSON struct type files)
397
+ # ---------------------------------------------------------------------------
398
+
399
# Lower-cased Spark primitive type name -> model field type.
# Names that are not listed here are resolved by the caller's ``.get`` fallback.
_SPARK_TYPE_MAP = {
    "string": "string",
    "integer": "integer",
    "int": "integer",
    "long": "bigint",
    "bigint": "bigint",
    "short": "smallint",
    "smallint": "smallint",
    "byte": "tinyint",
    "tinyint": "tinyint",
    "float": "float",
    # "double" is folded into "float"; there is no separate double target here.
    "double": "float",
    "boolean": "boolean",
    "binary": "binary",
    "date": "date",
    "timestamp": "timestamp",
    "timestamp_ntz": "timestamp",
    # "void" has no counterpart and is represented as a string.
    "void": "string",
}
418
+
419
+
420
+ def _spark_field_type(spark_type: Any) -> str:
421
+ """Map a Spark schema type to a DataLex field type."""
422
+ if isinstance(spark_type, str):
423
+ lower = spark_type.lower()
424
+ if lower.startswith("decimal"):
425
+ return lower
426
+ if lower.startswith("varchar") or lower.startswith("char"):
427
+ return "string"
428
+ if lower.startswith("array") or lower.startswith("map") or lower.startswith("struct"):
429
+ return "json"
430
+ return _SPARK_TYPE_MAP.get(lower, "string")
431
+ if isinstance(spark_type, dict):
432
+ type_name = spark_type.get("type", "string")
433
+ if isinstance(type_name, str):
434
+ lower = type_name.lower()
435
+ if lower == "struct":
436
+ return "json"
437
+ if lower == "array":
438
+ return "json"
439
+ if lower == "map":
440
+ return "json"
441
+ if lower == "udt":
442
+ return "json"
443
+ return _SPARK_TYPE_MAP.get(lower, "string")
444
+ return "json"
445
+ return "string"
446
+
447
+
448
def _coerce_nullable(value: Any) -> bool:
    """Interpret a ``nullable`` flag that may be a bool or a string."""
    if isinstance(value, str):
        text = value.strip().lower()
        # bool("false") is True, so textual negatives are handled explicitly.
        if text in {"false", "no", "n", "f", "0"}:
            return False
        return bool(text)
    return bool(value)


def import_spark_schema(
    schema_text: str,
    model_name: str = "imported_spark_schema",
    domain: str = "imported",
    owners: Optional[List[str]] = None,
    table_name: Optional[str] = None,
) -> Dict[str, Any]:
    """Import a Spark schema JSON file into a DataLex model.

    Supports:
    - Single StructType schema (from df.schema.json() or DESCRIBE TABLE output)
    - Array of named table schemas [{name: "...", schema: {...}}, ...]
    - Databricks catalog export format with table_name + columns

    Args:
        schema_text: JSON document describing one or more table schemas.
        model_name: Name for the resulting model; also the entity-name
            fallback for a single schema without its own name.
        domain: Domain recorded in the model header.
        owners: Owner list; defaults to ``["data-team@example.com"]``.
        table_name: Optional entity name for a single-schema document.

    Returns:
        The model dict with one entity per recognized table, in input order.
        A document of any other shape yields a model without entities.

    Raises:
        json.JSONDecodeError: If *schema_text* is not valid JSON.
    """
    owners = owners or ["data-team@example.com"]
    model = _default_model(model_name=model_name, domain=domain, owners=owners)

    schema = json.loads(schema_text)

    tables_to_process: List[Tuple[str, Any]] = []

    if isinstance(schema, list):
        # Array of table schemas
        for idx, item in enumerate(schema):
            if isinstance(item, dict):
                name = item.get("name") or item.get("table_name") or f"table_{idx}"
                inner = item.get("schema") or item.get("columns") or item
                tables_to_process.append((name, inner))
    elif isinstance(schema, dict):
        if schema.get("type") == "struct" and "fields" in schema:
            # Single StructType
            tables_to_process.append((table_name or model_name, schema))
        elif "columns" in schema:
            # Databricks-style: {table_name: "...", columns: [...]}
            name = schema.get("table_name") or schema.get("name") or table_name or model_name
            tables_to_process.append((name, schema))
        elif "fields" in schema:
            tables_to_process.append((table_name or model_name, schema))

    for tbl_name, tbl_schema in tables_to_process:
        # Names come straight from JSON and may not be strings.
        entity_name = _to_pascal(str(tbl_name))

        # Extract fields from StructType or columns array
        raw_fields: Any = []
        if isinstance(tbl_schema, dict):
            if "fields" in tbl_schema:
                raw_fields = tbl_schema["fields"]
            elif "columns" in tbl_schema:
                raw_fields = tbl_schema["columns"]
        elif isinstance(tbl_schema, list):
            raw_fields = tbl_schema

        fields: List[Dict[str, Any]] = []
        for raw_field in raw_fields:
            if not isinstance(raw_field, dict):
                continue

            fname = raw_field.get("name", "")
            if not fname:
                continue

            ftype_raw = raw_field.get("type", raw_field.get("data_type", "string"))

            field: Dict[str, Any] = {
                "name": fname,
                "type": _spark_field_type(ftype_raw),
                "nullable": _coerce_nullable(raw_field.get("nullable", True)),
            }

            metadata = raw_field.get("metadata", {})
            if isinstance(metadata, dict):
                if metadata.get("comment"):
                    field["description"] = metadata["comment"]
                if metadata.get("sensitivity"):
                    field["sensitivity"] = metadata["sensitivity"]

            # A top-level comment takes precedence over metadata.comment.
            if raw_field.get("comment"):
                field["description"] = raw_field["comment"]

            fields.append(field)

        entity: Dict[str, Any] = {
            "name": entity_name,
            "type": "table",
            "description": f"Imported from Spark schema on {date.today().isoformat()}",
            "fields": fields,
        }
        model["entities"].append(entity)

    return model
542
+
543
+
544
+ # ---------------------------------------------------------------------------
545
+ # dbt schema.yml importer
546
+ # ---------------------------------------------------------------------------
547
+
548
def _dbt_parse_to_entity(to_expr: Any) -> Optional[str]:
    """Resolve a dbt ``to:`` expression to a PascalCase entity name.

    Handles, in order:
      1. ``ref('model')``
      2. ``ref('package', 'model')`` and ``ref('model', v=...)`` forms, where
         the model is the last quoted positional argument
      3. ``source('source', 'table')`` (the table name is used)
      4. a plain, possibly dotted and quoted, relation name (last segment)

    Args:
        to_expr: The raw expression; anything that is not a non-blank string
            yields ``None``.

    Returns:
        The entity name, or ``None`` when nothing usable is found.
    """
    if not isinstance(to_expr, str):
        return None
    text = to_expr.strip()
    if not text:
        return None

    ref_match = DBT_REF_RE.search(text)
    if ref_match:
        return _to_pascal(ref_match.group(1))

    # DBT_REF_RE only accepts a single argument; without this branch a
    # two-argument or versioned ref would reach the dotted-name fallback and
    # the whole call text would be turned into an entity name.
    extended_ref = re.search(
        r"ref\(\s*['\"]([^'\"]+)['\"]\s*(?:,\s*['\"]([^'\"]+)['\"]\s*)?(?:,[^)]*)?\)",
        text,
        flags=re.IGNORECASE,
    )
    if extended_ref:
        return _to_pascal(extended_ref.group(2) or extended_ref.group(1))

    source_match = DBT_SOURCE_RE.search(text)
    if source_match:
        return _to_pascal(source_match.group(2))

    token = text.split(".")[-1].strip().strip("'\"")
    return _to_pascal(token) if token else None
565
+
566
+
567
+ def _as_test_list(tests: Any) -> List[Any]:
568
+ if tests is None:
569
+ return []
570
+ if isinstance(tests, list):
571
+ return tests
572
+ return [tests]
573
+
574
+
575
+ def _as_constraint_list(constraints: Any) -> List[Any]:
576
+ if constraints is None:
577
+ return []
578
+ if isinstance(constraints, list):
579
+ return constraints
580
+ return [constraints]
581
+
582
+
583
def _dbt_constraint_target(constraint: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
    """Return the ``(entity, field)`` a dbt foreign-key constraint points at.

    The structured form is tried first: ``to`` (or ``references``) resolved
    through ``_dbt_parse_to_entity`` together with a snake-cased ``field``.
    When that does not yield both parts, ``expression`` (or ``references``)
    is searched for a SQL ``REFERENCES table (column)`` fragment.  Each part
    that cannot be determined is ``None``.
    """
    structured_entity = _dbt_parse_to_entity(
        constraint.get("to") or constraint.get("references")
    )
    structured_field = _to_snake(str(constraint.get("field") or "").strip())
    if structured_entity and structured_field:
        return structured_entity, structured_field

    sql_text = str(constraint.get("expression") or constraint.get("references") or "").strip()
    found = DBT_SQL_REF_RE.search(sql_text) if sql_text else None
    if found is None:
        return None, None

    # Only the last dotted segment of the referenced relation names the entity.
    relation = found.group(1).replace('"', "").split(".")[-1]
    column = _to_snake(found.group(2).replace('"', "").strip())
    entity_part = _to_pascal(relation) if relation else None
    field_part = column or None
    return entity_part, field_part
599
+
600
+
601
def _ensure_field(entity: Dict[str, Any], field_name: str) -> None:
    """Append a placeholder string field to *entity* unless it already exists.

    *field_name* is snake-cased first; nothing happens when the result is
    empty.  The entity's ``fields`` list is created if missing.
    """
    normalized = _to_snake(field_name)
    if not normalized:
        return

    known_fields = entity.setdefault("fields", [])
    for candidate in known_fields:
        if str(candidate.get("name", "")) == normalized:
            return

    placeholder = {
        "name": normalized,
        "type": "string",
        "nullable": True,
        "description": "Inferred from dbt relationships test",
    }
    known_fields.append(placeholder)
616
+
617
+
618
+ def _upsert_field(entity: Dict[str, Any], field: Dict[str, Any]) -> None:
619
+ fields = entity.setdefault("fields", [])
620
+ name = str(field.get("name", ""))
621
+ if not name:
622
+ return
623
+ existing = next((f for f in fields if str(f.get("name", "")) == name), None)
624
+ if existing is None:
625
+ fields.append(field)
626
+ return
627
+
628
+ if field.get("type") and (not existing.get("type") or existing.get("type") == "string"):
629
+ existing["type"] = field["type"]
630
+ if field.get("description") and not existing.get("description"):
631
+ existing["description"] = field["description"]
632
+ if field.get("nullable") is False:
633
+ existing["nullable"] = False
634
+ if field.get("unique"):
635
+ existing["unique"] = True
636
+ if field.get("primary_key"):
637
+ existing["primary_key"] = True
638
+ if field.get("foreign_key"):
639
+ existing["foreign_key"] = True
640
+
641
+
642
+ def _ensure_non_empty_fields(entity: Dict[str, Any]) -> None:
643
+ fields = entity.setdefault("fields", [])
644
+ if fields:
645
+ return
646
+ fields.append(
647
+ {
648
+ "name": "row_id",
649
+ "type": "string",
650
+ "nullable": True,
651
+ "description": "Placeholder field inferred because dbt schema did not define columns.",
652
+ }
653
+ )
654
+
655
+
656
+ def _build_placeholder_entity() -> Dict[str, Any]:
657
+ return {
658
+ "name": "DbtSchemaInfo",
659
+ "type": "view",
660
+ "description": "Placeholder entity generated because dbt schema did not define importable models/sources.",
661
+ "fields": [
662
+ {
663
+ "name": "row_id",
664
+ "type": "string",
665
+ "nullable": True,
666
+ "description": "Placeholder field inferred because dbt schema did not define columns.",
667
+ }
668
+ ],
669
+ }
670
+
671
+
672
+ def _semantic_dimension_field_type(dim_type: str) -> str:
673
+ d = str(dim_type or "").strip().lower()
674
+ if d == "time":
675
+ return "date"
676
+ return "string"
677
+
678
+
679
+ def _semantic_measure_field_type(agg: str) -> str:
680
+ a = str(agg or "").strip().lower()
681
+ if a in {"count", "count_distinct"}:
682
+ return "bigint"
683
+ return "decimal(18,2)"
684
+
685
+
686
+ def import_dbt_schema_yml(
687
+ schema_yml_text: str,
688
+ model_name: str = "imported_dbt_model",
689
+ domain: str = "imported",
690
+ owners: List[str] = None,
691
+ ) -> Dict[str, Any]:
692
+ owners = owners or ["data-team@example.com"]
693
+ model = _default_model(model_name=model_name, domain=domain, owners=owners)
694
+
695
+ loaded = yaml.safe_load(schema_yml_text) or {}
696
+ if not isinstance(loaded, dict):
697
+ return model
698
+
699
+ entities_by_name: Dict[str, Dict[str, Any]] = {}
700
+ relationship_candidates: List[Dict[str, str]] = []
701
+
702
+ def get_or_create_entity(
703
+ raw_name: str,
704
+ entity_type: str,
705
+ description: str = "",
706
+ tags: Optional[List[str]] = None,
707
+ schema_name: str = "",
708
+ subject_area: str = "",
709
+ ) -> Dict[str, Any]:
710
+ entity_name = _to_pascal(raw_name)
711
+ if entity_name in entities_by_name:
712
+ existing = entities_by_name[entity_name]
713
+ if description and not existing.get("description"):
714
+ existing["description"] = description
715
+ if schema_name and not existing.get("schema"):
716
+ existing["schema"] = schema_name
717
+ if subject_area and not existing.get("subject_area"):
718
+ existing["subject_area"] = subject_area
719
+ if tags:
720
+ merged = set(existing.get("tags", []))
721
+ merged.update(str(t) for t in tags if t)
722
+ existing["tags"] = sorted(merged)
723
+ return existing
724
+
725
+ entity: Dict[str, Any] = {
726
+ "name": entity_name,
727
+ "type": entity_type,
728
+ "description": description or f"Imported from dbt schema.yml on {date.today().isoformat()}",
729
+ "fields": [],
730
+ }
731
+ if schema_name:
732
+ entity["schema"] = schema_name
733
+ if subject_area:
734
+ entity["subject_area"] = subject_area
735
+ if tags:
736
+ entity["tags"] = sorted({str(t) for t in tags if t})
737
+ entities_by_name[entity_name] = entity
738
+ return entity
739
+
740
+ def process_columns(columns: Any, entity: Dict[str, Any]) -> None:
741
+ if not isinstance(columns, list):
742
+ return
743
+ for col in columns:
744
+ if not isinstance(col, dict):
745
+ continue
746
+ col_name = _to_snake(str(col.get("name", "")).strip())
747
+ if not col_name:
748
+ continue
749
+
750
+ field: Dict[str, Any] = {
751
+ "name": col_name,
752
+ "type": str(col.get("data_type") or col.get("type") or "string"),
753
+ "nullable": True,
754
+ }
755
+ if col.get("description"):
756
+ field["description"] = str(col["description"])
757
+
758
+ tests = _as_test_list(col.get("tests")) + _as_test_list(col.get("data_tests"))
759
+ has_not_null = False
760
+ has_unique = False
761
+ has_fk = False
762
+
763
+ for test_def in tests:
764
+ if isinstance(test_def, str):
765
+ tname = test_def.split(".")[-1].lower()
766
+ if tname == "not_null":
767
+ has_not_null = True
768
+ elif tname == "unique":
769
+ has_unique = True
770
+ continue
771
+
772
+ if not isinstance(test_def, dict):
773
+ continue
774
+
775
+ for test_name, test_cfg in test_def.items():
776
+ tname = str(test_name).split(".")[-1].lower()
777
+ if tname == "not_null":
778
+ has_not_null = True
779
+ elif tname == "unique":
780
+ has_unique = True
781
+ elif tname == "relationships":
782
+ cfg = test_cfg if isinstance(test_cfg, dict) else {}
783
+ target_entity = _dbt_parse_to_entity(cfg.get("to"))
784
+ target_field = _to_snake(str(cfg.get("field") or "").strip())
785
+ if target_entity and target_field:
786
+ relationship_candidates.append(
787
+ {
788
+ "parent_entity": target_entity,
789
+ "parent_field": target_field,
790
+ "child_entity": str(entity.get("name", "")),
791
+ "child_field": col_name,
792
+ }
793
+ )
794
+ has_fk = True
795
+
796
+ for constraint_def in _as_constraint_list(col.get("constraints")):
797
+ if isinstance(constraint_def, str):
798
+ cname = constraint_def.lower().strip().replace(" ", "_")
799
+ if cname == "not_null":
800
+ has_not_null = True
801
+ elif cname == "unique":
802
+ has_unique = True
803
+ elif cname == "primary_key":
804
+ has_not_null = True
805
+ has_unique = True
806
+ continue
807
+
808
+ if not isinstance(constraint_def, dict):
809
+ continue
810
+
811
+ ctype = str(constraint_def.get("type") or constraint_def.get("constraint_type") or "").lower().strip().replace(" ", "_")
812
+ if ctype == "not_null":
813
+ has_not_null = True
814
+ elif ctype == "unique":
815
+ has_unique = True
816
+ elif ctype == "primary_key":
817
+ has_not_null = True
818
+ has_unique = True
819
+ elif ctype == "foreign_key":
820
+ has_fk = True
821
+ target_entity, target_field = _dbt_constraint_target(constraint_def)
822
+ if target_entity and target_field:
823
+ relationship_candidates.append(
824
+ {
825
+ "parent_entity": target_entity,
826
+ "parent_field": target_field,
827
+ "child_entity": str(entity.get("name", "")),
828
+ "child_field": col_name,
829
+ }
830
+ )
831
+
832
+ if has_not_null:
833
+ field["nullable"] = False
834
+ if has_unique:
835
+ field["unique"] = True
836
+ if has_unique and has_not_null:
837
+ field["primary_key"] = True
838
+ if has_fk:
839
+ field["foreign_key"] = True
840
+
841
+ _upsert_field(entity, field)
842
+
843
+ # dbt sources -> external tables
844
+ for source in loaded.get("sources", []) if isinstance(loaded.get("sources"), list) else []:
845
+ if not isinstance(source, dict):
846
+ continue
847
+ source_name = str(source.get("name", "")).strip()
848
+ source_schema = str(source.get("schema", "")).strip()
849
+ source_tags = source.get("tags") if isinstance(source.get("tags"), list) else []
850
+ for table in source.get("tables", []) if isinstance(source.get("tables"), list) else []:
851
+ if not isinstance(table, dict):
852
+ continue
853
+ table_name = str(table.get("name", "")).strip()
854
+ if not table_name:
855
+ continue
856
+ table_tags = table.get("tags") if isinstance(table.get("tags"), list) else []
857
+ entity = get_or_create_entity(
858
+ raw_name=table_name,
859
+ entity_type="external_table",
860
+ description=str(table.get("description", "")).strip(),
861
+ tags=[*source_tags, *table_tags],
862
+ schema_name=source_schema,
863
+ subject_area=source_name,
864
+ )
865
+ process_columns(table.get("columns"), entity)
866
+
867
+ # dbt models -> views (safe default)
868
+ for dbt_model in loaded.get("models", []) if isinstance(loaded.get("models"), list) else []:
869
+ if not isinstance(dbt_model, dict):
870
+ continue
871
+ model_raw_name = str(dbt_model.get("name", "")).strip()
872
+ if not model_raw_name:
873
+ continue
874
+ dbt_tags = dbt_model.get("tags") if isinstance(dbt_model.get("tags"), list) else []
875
+ dbt_meta = dbt_model.get("meta") if isinstance(dbt_model.get("meta"), dict) else {}
876
+ entity = get_or_create_entity(
877
+ raw_name=model_raw_name,
878
+ entity_type="view",
879
+ description=str(dbt_model.get("description", "")).strip(),
880
+ tags=dbt_tags,
881
+ schema_name=str(dbt_model.get("schema", "")).strip(),
882
+ subject_area=str(dbt_meta.get("subject_area", "")).strip(),
883
+ )
884
+ process_columns(dbt_model.get("columns"), entity)
885
+
886
+ for constraint_def in _as_constraint_list(dbt_model.get("constraints")):
887
+ if not isinstance(constraint_def, dict):
888
+ continue
889
+ ctype = str(constraint_def.get("type") or constraint_def.get("constraint_type") or "").lower().strip().replace(" ", "_")
890
+ cols = constraint_def.get("columns")
891
+ if not isinstance(cols, list):
892
+ continue
893
+ col_names = [_to_snake(str(c).strip()) for c in cols if str(c).strip()]
894
+ if not col_names:
895
+ continue
896
+
897
+ if ctype == "primary_key":
898
+ for cname in col_names:
899
+ _ensure_field(entity, cname)
900
+ for fld in entity.get("fields", []):
901
+ if str(fld.get("name", "")) == cname:
902
+ fld["primary_key"] = True
903
+ fld["nullable"] = False
904
+ fld["unique"] = True
905
+ elif ctype == "foreign_key":
906
+ target_entity, target_field = _dbt_constraint_target(constraint_def)
907
+ for cname in col_names:
908
+ _ensure_field(entity, cname)
909
+ for fld in entity.get("fields", []):
910
+ if str(fld.get("name", "")) == cname:
911
+ fld["foreign_key"] = True
912
+ if target_entity and target_field:
913
+ relationship_candidates.append(
914
+ {
915
+ "parent_entity": target_entity,
916
+ "parent_field": target_field,
917
+ "child_entity": str(entity.get("name", "")),
918
+ "child_field": cname,
919
+ }
920
+ )
921
+
922
+ # dbt semantic models -> views with semantic keys/dimensions/measures
923
+ for semantic_model in loaded.get("semantic_models", []) if isinstance(loaded.get("semantic_models"), list) else []:
924
+ if not isinstance(semantic_model, dict):
925
+ continue
926
+ sm_name = str(semantic_model.get("name", "")).strip()
927
+ if not sm_name:
928
+ continue
929
+ sm_tags = semantic_model.get("tags") if isinstance(semantic_model.get("tags"), list) else []
930
+ entity = get_or_create_entity(
931
+ raw_name=sm_name,
932
+ entity_type="view",
933
+ description=str(semantic_model.get("description", "")).strip(),
934
+ tags=[*sm_tags, "SEMANTIC_MODEL"],
935
+ )
936
+
937
+ for sem_entity in semantic_model.get("entities", []) if isinstance(semantic_model.get("entities"), list) else []:
938
+ if not isinstance(sem_entity, dict):
939
+ continue
940
+ field_name = _to_snake(str(sem_entity.get("expr") or sem_entity.get("name") or "").strip())
941
+ if not field_name:
942
+ continue
943
+ role = str(sem_entity.get("type") or "").strip().lower()
944
+ field: Dict[str, Any] = {
945
+ "name": field_name,
946
+ "type": "string",
947
+ "nullable": role != "primary",
948
+ "description": f"Semantic entity key ({role or 'entity'}).",
949
+ }
950
+ if role == "primary":
951
+ field["primary_key"] = True
952
+ elif role == "foreign":
953
+ field["foreign_key"] = True
954
+ _upsert_field(entity, field)
955
+
956
+ for dim in semantic_model.get("dimensions", []) if isinstance(semantic_model.get("dimensions"), list) else []:
957
+ if not isinstance(dim, dict):
958
+ continue
959
+ field_name = _to_snake(str(dim.get("expr") or dim.get("name") or "").strip())
960
+ if not field_name:
961
+ continue
962
+ dim_type = str(dim.get("type") or "").strip()
963
+ field: Dict[str, Any] = {
964
+ "name": field_name,
965
+ "type": _semantic_dimension_field_type(dim_type),
966
+ "nullable": True,
967
+ "description": str(dim.get("description") or f"Semantic dimension ({dim_type or 'dimension'})."),
968
+ }
969
+ _upsert_field(entity, field)
970
+
971
+ for measure in semantic_model.get("measures", []) if isinstance(semantic_model.get("measures"), list) else []:
972
+ if not isinstance(measure, dict):
973
+ continue
974
+ field_name = _to_snake(str(measure.get("expr") or measure.get("name") or "").strip())
975
+ if not field_name:
976
+ continue
977
+ agg = str(measure.get("agg") or "").strip()
978
+ field: Dict[str, Any] = {
979
+ "name": field_name,
980
+ "type": _semantic_measure_field_type(agg),
981
+ "nullable": True,
982
+ "description": str(measure.get("description") or f"Semantic measure ({agg or 'measure'})."),
983
+ }
984
+ _upsert_field(entity, field)
985
+
986
+ # dbt metrics -> compact catalog entity
987
+ metrics = loaded.get("metrics", []) if isinstance(loaded.get("metrics"), list) else []
988
+ if metrics:
989
+ metric_entity = get_or_create_entity(
990
+ raw_name="metric_catalog",
991
+ entity_type="view",
992
+ description="dbt metric definitions imported from semantic layer.",
993
+ tags=["METRIC"],
994
+ )
995
+ for metric in metrics:
996
+ if not isinstance(metric, dict):
997
+ continue
998
+ metric_name = _to_snake(str(metric.get("name", "")).strip())
999
+ if not metric_name:
1000
+ continue
1001
+ metric_type = str(metric.get("type") or "").strip()
1002
+ field: Dict[str, Any] = {
1003
+ "name": metric_name,
1004
+ "type": "decimal(18,2)",
1005
+ "nullable": True,
1006
+ "description": str(metric.get("description") or metric.get("label") or f"dbt metric ({metric_type or 'metric'})."),
1007
+ }
1008
+ _upsert_field(metric_entity, field)
1009
+
1010
+ # Materialize relationship tests into DataLex relationships where resolvable.
1011
+ deduped: Dict[Tuple[str, str, str, str], Dict[str, str]] = {}
1012
+ for cand in relationship_candidates:
1013
+ parent = entities_by_name.get(cand["parent_entity"])
1014
+ child = entities_by_name.get(cand["child_entity"])
1015
+ if not parent or not child:
1016
+ continue
1017
+ _ensure_field(parent, cand["parent_field"])
1018
+ _ensure_field(child, cand["child_field"])
1019
+
1020
+ rel = {
1021
+ "name": f"{cand['parent_entity'].lower()}_{cand['child_entity'].lower()}_{cand['child_field']}_fk",
1022
+ "from": f"{cand['parent_entity']}.{cand['parent_field']}",
1023
+ "to": f"{cand['child_entity']}.{cand['child_field']}",
1024
+ "cardinality": "one_to_many",
1025
+ }
1026
+ key = (rel["name"], rel["from"], rel["to"], rel["cardinality"])
1027
+ deduped[key] = rel
1028
+
1029
+ model_entities = sorted(entities_by_name.values(), key=lambda e: str(e.get("name", "")))
1030
+ if not model_entities:
1031
+ model_entities = [_build_placeholder_entity()]
1032
+ for ent in model_entities:
1033
+ _ensure_non_empty_fields(ent)
1034
+ model["entities"] = model_entities
1035
+ model["relationships"] = sorted(deduped.values(), key=lambda r: str(r.get("name", "")))
1036
+ return model
1037
+
1038
+
1039
+ # ── dbt round-trip sync ───────────────────────────────────────────────────────
1040
+
1041
+ def _re_snake(name: str) -> str:
1042
+ """Convert PascalCase entity name to snake_case for dbt model name matching."""
1043
+ out: List[str] = []
1044
+ for idx, char in enumerate(name):
1045
+ if char.isupper() and idx > 0 and not name[idx - 1].isupper():
1046
+ out.append("_")
1047
+ out.append(char.lower())
1048
+ return "".join(out)
1049
+
1050
+
1051
def _dbt_sync_merge_tags(existing: Any, incoming: Any) -> List[Any]:
    """Return the de-duplicated union of two tag lists, ordered by string form.

    Existing tags keep their original value (and type); incoming tags are
    stringified and empty ones dropped. Keying on ``str(tag)`` avoids the
    ``TypeError`` that ``sorted(set(...))`` raises for mixed-type or
    unhashable tags (e.g. YAML ``tags: [2024, pii]``).
    """
    merged: Dict[str, Any] = {}
    for tag in existing if isinstance(existing, list) else []:
        merged.setdefault(str(tag), tag)
    for tag in incoming if isinstance(incoming, (list, tuple, set)) else []:
        if tag:
            merged.setdefault(str(tag), str(tag))
    return [merged[key] for key in sorted(merged)]


def _dbt_sync_fill_meta(node: Dict[str, Any], updates: Dict[str, Any]) -> None:
    """Copy truthy ``updates`` into ``node["meta"]`` for keys that are empty there.

    ``node["meta"]`` is only created (or replaced, when it is not a mapping)
    if at least one value is actually written, so untouched models/columns do
    not gain an empty ``meta: {}`` entry.
    """
    current = node.get("meta")
    target: Dict[str, Any] = current if isinstance(current, dict) else {}
    wrote = False
    for key, value in updates.items():
        if value and not target.get(key):
            target[key] = value
            wrote = True
    if wrote and target is not current:
        node["meta"] = target


def _dbt_sync_entity_index(model: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Index DataLex entities by snake_case name plus common dbt prefixes.

    Exact snake_case names always take priority: a prefixed alias such as
    ``stg_orders`` (derived from entity ``Orders``) never shadows a real
    entity whose own name converts to ``stg_orders``.
    """
    named: List[Any] = []
    for entity in model.get("entities") or []:
        if not isinstance(entity, dict):
            continue
        snake = _re_snake(str(entity.get("name", "")))
        if snake:  # nameless entities cannot be matched meaningfully
            named.append((snake, entity))

    index: Dict[str, Dict[str, Any]] = {snake: entity for snake, entity in named}
    exact_names = set(index)
    type_prefix = {"fact_table": "fct_", "dimension_table": "dim_", "bridge_table": "brd_"}

    for snake, entity in named:
        aliases = [f"stg_{snake}"]
        prefix = type_prefix.get(str(entity.get("type", "table")))
        if prefix:
            aliases.append(f"{prefix}{snake}")
        for alias in aliases:
            if alias not in exact_names:
                index[alias] = entity
    return index


def sync_dbt_schema_yml(
    model: Dict[str, Any],
    existing_dbt_schema_yml: str,
) -> str:
    """Merge DataLex model metadata INTO an existing dbt schema.yml (non-destructive).

    DataLex-sourced metadata (descriptions, tags, owner, sensitivity, dimensional
    properties) is written into dbt models/columns that have empty counterparts.
    Existing dbt content (tests, ref() expressions, non-empty descriptions) is NEVER
    overwritten.

    Matching rules:
      * A dbt model is matched to an entity by the entity's snake_case name,
        either exactly or after adding/stripping a ``stg_``/``fct_``/``dim_``/
        ``brd_`` prefix. Exact names win over prefixed aliases.
      * Columns are matched to entity fields by exact name.

    Merge rules:
      * ``description`` and ``meta`` keys are filled only when empty in dbt.
      * ``tags`` become the union of both sides, ordered by string form.
      * ``meta`` is only added when there is something to put in it.

    Args:
        model: DataLex model dict; its ``entities`` list is read.
        existing_dbt_schema_yml: Text of the dbt schema YAML to update.

    Returns:
        The updated dbt schema YAML as a string.

    Raises:
        yaml.YAMLError: If ``existing_dbt_schema_yml`` is not valid YAML.
    """
    loaded: Dict[str, Any] = yaml.safe_load(existing_dbt_schema_yml) or {}
    if not isinstance(loaded, dict):
        loaded = {}

    entity_by_snake = _dbt_sync_entity_index(model)

    dbt_models = loaded.get("models", [])
    if not isinstance(dbt_models, list):
        dbt_models = []

    for dbt_model in dbt_models:
        if not isinstance(dbt_model, dict):
            continue
        raw_name = str(dbt_model.get("name", "")).strip()

        # Try exact match first, then the name with each common prefix stripped.
        candidates = [raw_name] + [
            raw_name.removeprefix(prefix) for prefix in ("stg_", "fct_", "dim_", "brd_")
        ]
        entity = next(
            (entity_by_snake[key] for key in candidates if key in entity_by_snake),
            None,
        )
        if not entity:
            continue

        # Fill description only if dbt model's is empty
        if not dbt_model.get("description") and entity.get("description"):
            dbt_model["description"] = entity["description"]

        # Merge tags (union, no duplicates)
        merged = _dbt_sync_merge_tags(dbt_model.get("tags"), entity.get("tags"))
        if merged:
            dbt_model["tags"] = merged

        # Fill meta fields (insertion order mirrors the order keys are emitted in)
        meta_updates: Dict[str, Any] = {
            "owner": entity.get("owner"),
            "subject_area": entity.get("subject_area"),
        }
        entity_type = str(entity.get("type", "table"))
        if entity_type in {"fact_table", "dimension_table", "bridge_table"}:
            # Dimensional metadata
            meta_updates["entity_type"] = entity_type
            meta_updates["scd_type"] = entity.get("scd_type")
            meta_updates["natural_key"] = entity.get("natural_key")
            meta_updates["conformed"] = True if entity.get("conformed") else None
            meta_updates["dimension_refs"] = entity.get("dimension_refs")
        _dbt_sync_fill_meta(dbt_model, meta_updates)

        # Build field lookup from DataLex entity
        field_map: Dict[str, Dict[str, Any]] = {
            str(f.get("name", "")): f
            for f in (entity.get("fields") or [])
            if isinstance(f, dict)
        }

        dbt_columns = dbt_model.get("columns", [])
        if not isinstance(dbt_columns, list):
            continue

        for col in dbt_columns:
            if not isinstance(col, dict):
                continue
            col_name = str(col.get("name", "")).strip()
            dc_field = field_map.get(col_name)
            if not dc_field:
                continue

            # Fill column description if empty
            if not col.get("description") and dc_field.get("description"):
                col["description"] = dc_field["description"]

            # Merge column tags
            merged_col = _dbt_sync_merge_tags(col.get("tags"), dc_field.get("tags"))
            if merged_col:
                col["tags"] = merged_col

            # Fill column meta sensitivity if empty
            _dbt_sync_fill_meta(col, {"sensitivity": dc_field.get("sensitivity")})

    return yaml.dump(loaded, default_flow_style=False, sort_keys=False, allow_unicode=True)
1176
+ return model