datalex-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datalex_cli/__init__.py +1 -0
- datalex_cli/datalex_cli.py +658 -0
- datalex_cli/main.py +2925 -0
- datalex_cli-0.1.1.dist-info/METADATA +228 -0
- datalex_cli-0.1.1.dist-info/RECORD +64 -0
- datalex_cli-0.1.1.dist-info/WHEEL +5 -0
- datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
- datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
- datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
- datalex_core/__init__.py +94 -0
- datalex_core/_schemas/datalex/common.schema.json +127 -0
- datalex_core/_schemas/datalex/domain.schema.json +24 -0
- datalex_core/_schemas/datalex/entity.schema.json +158 -0
- datalex_core/_schemas/datalex/model.schema.json +141 -0
- datalex_core/_schemas/datalex/policy.schema.json +70 -0
- datalex_core/_schemas/datalex/project.schema.json +82 -0
- datalex_core/_schemas/datalex/snippet.schema.json +24 -0
- datalex_core/_schemas/datalex/source.schema.json +104 -0
- datalex_core/_schemas/datalex/term.schema.json +30 -0
- datalex_core/canonical.py +166 -0
- datalex_core/completion.py +204 -0
- datalex_core/connectors/__init__.py +39 -0
- datalex_core/connectors/base.py +417 -0
- datalex_core/connectors/bigquery.py +229 -0
- datalex_core/connectors/databricks.py +262 -0
- datalex_core/connectors/mysql.py +266 -0
- datalex_core/connectors/postgres.py +309 -0
- datalex_core/connectors/redshift.py +298 -0
- datalex_core/connectors/snowflake.py +336 -0
- datalex_core/connectors/sqlserver.py +425 -0
- datalex_core/datalex/__init__.py +26 -0
- datalex_core/datalex/diff.py +188 -0
- datalex_core/datalex/errors.py +85 -0
- datalex_core/datalex/loader.py +512 -0
- datalex_core/datalex/migrate_layout.py +382 -0
- datalex_core/datalex/parse_cache.py +102 -0
- datalex_core/datalex/project.py +214 -0
- datalex_core/datalex/types.py +224 -0
- datalex_core/dbt/__init__.py +18 -0
- datalex_core/dbt/emit.py +344 -0
- datalex_core/dbt/manifest.py +329 -0
- datalex_core/dbt/profiles.py +185 -0
- datalex_core/dbt/sync.py +279 -0
- datalex_core/dbt/warehouse.py +215 -0
- datalex_core/dialects/__init__.py +15 -0
- datalex_core/dialects/_common.py +48 -0
- datalex_core/dialects/base.py +47 -0
- datalex_core/dialects/postgres.py +164 -0
- datalex_core/dialects/registry.py +36 -0
- datalex_core/dialects/snowflake.py +129 -0
- datalex_core/diffing.py +358 -0
- datalex_core/docs_generator.py +797 -0
- datalex_core/doctor.py +181 -0
- datalex_core/generators.py +478 -0
- datalex_core/importers.py +1176 -0
- datalex_core/issues.py +23 -0
- datalex_core/loader.py +21 -0
- datalex_core/migrate.py +316 -0
- datalex_core/modeling.py +679 -0
- datalex_core/packages.py +430 -0
- datalex_core/policy.py +1037 -0
- datalex_core/resolver.py +456 -0
- datalex_core/schema.py +54 -0
- datalex_core/semantic.py +1561 -0
datalex_core/semantic.py
ADDED
|
@@ -0,0 +1,1561 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
4
|
+
|
|
5
|
+
from datalex_core.issues import Issue
|
|
6
|
+
from datalex_core.modeling import (
|
|
7
|
+
DATA_VAULT_ENTITY_TYPES,
|
|
8
|
+
DIMENSIONAL_ENTITY_TYPES,
|
|
9
|
+
normalize_model,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
# Identifier shapes.
# PascalCase: one ASCII capital followed by ASCII letters/digits.
PASCAL_CASE = re.compile(
    r"^[A-Z][A-Za-z0-9]*$"
)
# snake_case: one ASCII lowercase letter followed by lowercase letters,
# digits or underscores.
SNAKE_CASE = re.compile(
    r"^[a-z][a-z0-9_]*$"
)
# Dotted reference: a PascalCase part, a literal dot, then a snake_case part.
REL_REF = re.compile(
    r"^[A-Z][A-Za-z0-9]*\.[a-z][a-z0-9_]*$"
)

# Closed vocabularies (classification labels are upper-case, sensitivity
# levels are lower-case).
ALLOWED_CLASSIFICATIONS = {
    "PUBLIC",
    "INTERNAL",
    "CONFIDENTIAL",
    "PII",
    "PCI",
    "PHI",
}
ALLOWED_SENSITIVITY = {
    "public",
    "internal",
    "confidential",
    "restricted",
}
PK_REQUIRED_TYPES = {
    "table",
    "fact_table",
    "dimension_table",
    "hub",
    "link",
}

# Field name patterns that imply financial/sensitive values.
# NOTE(review): the pattern is an unanchored, case-insensitive substring
# match, so a name such as "network_id" also matches through "net".
_FINANCIAL_PATTERN = re.compile(
    r"(amount|revenue|cost|price|fee|salary|balance|total|gross|net)",
    re.IGNORECASE,
)
# Only the all-upper and all-lower spellings are listed; mixed case
# (e.g. "Pii") is not a member.
_PII_TAGS = {
    "PII",
    "PHI",
    "PCI",
    "pii",
    "phi",
    "pci",
}
_SENSITIVE_VALUES = {
    "restricted",
    "confidential",
}
# Audit timestamp column names: created_* and updated_*/modified_* with an
# at/on/date/time suffix.
_AUDIT_TIMESTAMP_CREATED = re.compile(
    r"^created_(at|on|date|time)$"
)
_AUDIT_TIMESTAMP_UPDATED = re.compile(
    r"^updated_(at|on|date|time)$|^modified_(at|on|date|time)$"
)
|
|
25
|
+
|
|
26
|
+
# ── Completeness scoring ───────────────────────────────────────────────────────
|
|
27
|
+
|
|
28
|
+
# Points awarded per completeness dimension. The values must sum to 100 so
# that an entity passing every dimension scores exactly 100.
_COMPLETENESS_WEIGHTS: Dict[str, int] = {
    # Entity has a non-empty description
    "description": 15,
    # Entity has an owner assigned
    "owner": 10,
    # Entity has at least one grain field
    "grain": 15,
    # >=80% of fields have descriptions
    "field_descriptions": 20,
    # Fields with sensitivity have classification, or entity has classified fields
    "classification": 10,
    # At least one glossary term cross-references this entity
    "glossary_linked": 10,
    # Entity has at least one tag
    "tags": 5,
    # Parent model declares a layer
    "layer": 5,
    # Entity has SLA defined (freshness or quality_score)
    "sla": 10,
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class EntityCompleteness:
    """Completeness scorecard for one entity.

    Field order is part of the interface (positional construction).
    """

    # Name of the entity this scorecard describes.
    entity_name: str
    # Total score in the range 0-100.
    score: int
    # Dimension name -> whether that dimension passed.
    dimensions: Dict[str, bool]
    # Human-readable labels of the items that are missing.
    missing: List[str]
    # Percentage of fields that have a description.
    field_description_pct: int
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class ModelCompleteness:
    """Model-wide completeness summary built from per-entity scorecards.

    Field order is part of the interface (positional construction).
    """

    # Name of the model the report was computed for.
    model_name: str
    # Average score across all entities.
    model_score: int
    # One scorecard per entity.
    entities: List[EntityCompleteness]
    # Number of entities that were scored.
    total_entities: int
    # Number of entities at 100%.
    fully_complete: int
    # Names of the entities scoring below 60%.
    needs_attention: List[str]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _entity_completeness(
    entity: Dict[str, Any],
    model: Dict[str, Any],
    glossary_entity_refs: Set[str],
) -> EntityCompleteness:
    """Score a single entity against the completeness dimensions.

    Every key of ``_COMPLETENESS_WEIGHTS`` is evaluated as a pass/fail
    dimension and the score is the sum of the weights of the passing ones.
    A key that is present but null or empty (for example ``description:``
    with no value) is treated the same as a missing key instead of raising.

    Args:
        entity: Entity definition; ``name``, ``type``, ``description``,
            ``owner``, ``grain``, ``fields``, ``tags`` and ``sla`` are read.
        model: Model document containing the entity; ``model.layer`` and
            ``governance.classification`` are read.
        glossary_entity_refs: Entity names referenced by glossary terms.

    Returns:
        An ``EntityCompleteness`` holding the score, the per-dimension
        results, labels for the failed dimensions and the percentage of
        fields that carry a description.
    """

    def _text(value: Any) -> str:
        # ``dict.get(key, "")`` still yields None for an explicit null, so
        # normalise here rather than calling ``.strip()`` on the raw value.
        return "" if value is None else str(value).strip()

    name = entity.get("name", "")
    entity_type = entity.get("type", "table")
    fields = entity.get("fields") or []
    # An explicit ``layer: null`` must not count as a declared layer
    # (``str(None)`` would be the truthy string "None").
    model_layer = _text((model.get("model") or {}).get("layer")).lower()

    # -- dimension: description
    has_description = bool(_text(entity.get("description")))

    # -- dimension: owner
    has_owner = bool(_text(entity.get("owner")))

    # -- dimension: grain (tables/views/MVs should define grain)
    grain = entity.get("grain", [])
    has_grain = isinstance(grain, list) and len(grain) > 0
    # Views, external_table, and dimension_table are exempt from the grain
    # requirement (dimension tables use a surrogate key as PK, not a grain).
    if entity_type in {"view", "external_table", "dimension_table"}:
        has_grain = True  # Not penalised

    # -- dimension: field descriptions (80% threshold)
    field_count = len(fields)
    described_count = sum(1 for f in fields if _text(f.get("description")))
    if field_count == 0:
        field_desc_pct = 100
        has_field_descriptions = True
    else:
        # Integer arithmetic: float division truncates e.g. 29/100 to 28.
        field_desc_pct = described_count * 100 // field_count
        has_field_descriptions = field_desc_pct >= 80

    # -- dimension: classification
    # Pass if any field has a sensitivity label OR the governance section
    # covers a field of this entity.
    gov_classification = (model.get("governance") or {}).get("classification") or {}
    entity_gov_refs = {k for k in gov_classification if str(k).startswith(f"{name}.")}
    field_sensitivities = [f for f in fields if f.get("sensitivity")]
    has_classification = bool(entity_gov_refs) or bool(field_sensitivities)
    # If no sensitive fields at all, grant the point (no classification
    # needed). Sensitivity is lower-cased and tags stringified, matching how
    # _lint_smart_nudges compares the same values.
    needs_classification = any(
        str(f.get("sensitivity") or "").lower() in _SENSITIVE_VALUES
        or any(str(t) in _PII_TAGS for t in (f.get("tags") or []))
        for f in fields
    )
    if not needs_classification:
        has_classification = True

    # -- dimension: glossary linked
    has_glossary_linked = name in glossary_entity_refs

    # -- dimension: tags
    has_tags = bool(entity.get("tags"))

    # -- dimension: layer
    has_layer = bool(model_layer)

    # -- dimension: sla
    sla = entity.get("sla")
    has_sla = isinstance(sla, dict) and bool(
        sla.get("freshness") or sla.get("quality_score")
    )

    dimensions = {
        "description": has_description,
        "owner": has_owner,
        "grain": has_grain,
        "field_descriptions": has_field_descriptions,
        "classification": has_classification,
        "glossary_linked": has_glossary_linked,
        "tags": has_tags,
        "layer": has_layer,
        "sla": has_sla,
    }

    score = sum(
        _COMPLETENESS_WEIGHTS[dim]
        for dim, passed in dimensions.items()
        if passed
    )

    missing_labels = {
        "description": "entity description",
        "owner": "owner",
        "grain": "grain definition",
        "field_descriptions": f"field descriptions ({field_desc_pct}% covered, need ≥80%)",
        "classification": "sensitivity classification on sensitive fields",
        "glossary_linked": "glossary term cross-reference",
        "tags": "tags",
        "layer": "model layer (source/transform/report)",
        "sla": "SLA (freshness or quality_score)",
    }
    missing = [missing_labels[dim] for dim, passed in dimensions.items() if not passed]

    return EntityCompleteness(
        entity_name=name,
        score=score,
        dimensions=dimensions,
        missing=missing,
        field_description_pct=field_desc_pct,
    )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _glossary_entity_refs(model: Dict[str, Any]) -> Set[str]:
    """Return the set of entity names referenced in any glossary term's related_fields.

    A reference contributes the text before its first dot; references
    without a dot are ignored. A null ``glossary`` or ``related_fields``
    value, or a glossary entry that is not a mapping, is skipped rather
    than raising.
    """
    refs: Set[str] = set()
    for term in model.get("glossary") or []:
        if not isinstance(term, dict):
            continue
        for field_ref in term.get("related_fields") or []:
            entity, dot, _ = str(field_ref).partition(".")
            if dot:
                refs.add(entity)
    return refs
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def completeness_report(model: Dict[str, Any]) -> ModelCompleteness:
    """Build the completeness report for every entity of *model*.

    The model score is the truncated mean of the entity scores (0 when the
    model has no entities); entities scoring below 60 are listed in
    ``needs_attention`` and those at exactly 100 are counted as fully
    complete.
    """
    model_name = model.get("model", {}).get("name", "unknown")
    entity_defs = model.get("entities", [])
    linked_entities = _glossary_entity_refs(model)

    scorecards: List[EntityCompleteness] = []
    for definition in entity_defs:
        scorecards.append(_entity_completeness(definition, model, linked_entities))

    if scorecards:
        total = sum(card.score for card in scorecards)
        average = int(total / len(scorecards))
    else:
        average = 0

    perfect = [card for card in scorecards if card.score == 100]
    lagging = [card.entity_name for card in scorecards if card.score < 60]

    return ModelCompleteness(
        model_name=model_name,
        model_score=average,
        entities=scorecards,
        total_entities=len(scorecards),
        fully_complete=len(perfect),
        needs_attention=lagging,
    )
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def completeness_as_dict(report: ModelCompleteness) -> Dict[str, Any]:
    """Serialise a ModelCompleteness report to a plain dict (JSON-safe).

    The ``weights`` entry is a copy of the module-level weight table, so a
    caller that mutates the returned structure cannot change how later
    reports are scored.
    """
    return {
        "model_name": report.model_name,
        "model_score": report.model_score,
        "total_entities": report.total_entities,
        "fully_complete": report.fully_complete,
        "needs_attention": report.needs_attention,
        # Copy: never hand out the shared module-level dict itself.
        "weights": dict(_COMPLETENESS_WEIGHTS),
        "entities": [
            {
                "name": e.entity_name,
                "score": e.score,
                "field_description_pct": e.field_description_pct,
                "dimensions": e.dimensions,
                "missing": e.missing,
            }
            for e in report.entities
        ],
    }
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _entity_field_refs(model: Dict[str, Any]) -> Set[str]:
    """Collect every ``Entity.field`` reference that exists in *model*.

    Entities or fields with a missing/empty name contribute nothing.
    """
    qualified: Set[str] = set()
    for owner in model.get("entities", []):
        entity_name = owner.get("name", "")
        field_names = (column.get("name", "") for column in owner.get("fields", []))
        qualified.update(
            f"{entity_name}.{field_name}"
            for field_name in field_names
            if entity_name and field_name
        )
    return qualified
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _entity_names(model: Dict[str, Any]) -> Set[str]:
    """Return the names of all entities in *model* that have a non-empty name."""
    found: Set[str] = set()
    for candidate in model.get("entities", []):
        label = candidate.get("name")
        if label:
            found.add(label)
    return found
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _entity_field_names(model: Dict[str, Any]) -> Dict[str, Set[str]]:
    """Map each named entity to the set of its non-empty field names.

    Entities without a name are left out; an entity without fields maps to
    an empty set. When a name occurs twice the later entity wins.
    """
    by_entity: Dict[str, Set[str]] = {}
    for owner in model.get("entities", []):
        owner_name = owner.get("name", "")
        if owner_name:
            raw_names = (column.get("name", "") for column in owner.get("fields", []))
            by_entity[owner_name] = {label for label in raw_names if label}
    return by_entity
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _entity_map(model: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Index the model's entities by name (stringified).

    Entries that are not dicts or have no truthy ``name`` are skipped; when
    a name occurs twice the later entity wins.
    """
    lookup: Dict[str, Dict[str, Any]] = {}
    for candidate in model.get("entities", []):
        if not isinstance(candidate, dict):
            continue
        label = candidate.get("name")
        if not label:
            continue
        lookup[str(label)] = candidate
    return lookup
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _normalized_keysets(value: Any) -> List[Tuple[str, ...]]:
    """Normalise a list of key lists into sorted tuples of strings.

    Anything that is not a list yields ``[]``; inner items that are not
    lists are dropped. Members whose string form is blank are removed, the
    rest are stringified (not stripped) and sorted.
    """
    if not isinstance(value, list):
        return []
    return [
        tuple(sorted(str(member) for member in group if str(member).strip()))
        for group in value
        if isinstance(group, list)
    ]
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _relationship_graph(model: Dict[str, Any]) -> Dict[str, Set[str]]:
    """Build an entity-level adjacency map from the model's relationships.

    Each relationship whose ``from`` and ``to`` both contain a dot adds an
    edge from the entity part of ``from`` to the entity part of ``to``.
    Every entity seen on either side gets a key, so targets without outgoing
    edges map to an empty set.
    """
    adjacency: Dict[str, Set[str]] = {}
    for edge in model.get("relationships", []):
        origin = edge.get("from", "")
        target = edge.get("to", "")
        if "." in origin and "." in target:
            head = origin.partition(".")[0]
            tail = target.partition(".")[0]
            adjacency.setdefault(head, set()).add(tail)
            adjacency.setdefault(tail, set())
    return adjacency
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _has_cycle(graph: Dict[str, Set[str]]) -> bool:
    """Return True when the directed *graph* contains a cycle.

    The search is an iterative depth-first traversal with three node states
    (unvisited / on the current path / finished), so long relationship
    chains cannot exhaust the interpreter's recursion limit. A neighbour
    that is not itself a key of *graph* is treated as a node with no
    outgoing edges instead of raising ``KeyError``.

    Args:
        graph: Mapping of node name to the set of nodes it points to.

    Returns:
        True if an edge leads back to a node on the current DFS path
        (self-loops included), otherwise False.
    """
    unvisited, on_path, finished = 0, 1, 2
    state: Dict[str, int] = {}

    for root in graph:
        if state.get(root, unvisited) != unvisited:
            continue
        state[root] = on_path
        # Each frame keeps a live iterator so a node's successors are
        # resumed, not restarted, after a child has been explored.
        stack: List[Tuple[str, Any]] = [(root, iter(graph.get(root, ())))]
        while stack:
            node, successors = stack[-1]
            for nxt in successors:
                colour = state.get(nxt, unvisited)
                if colour == on_path:
                    return True  # back edge onto the current path
                if colour == unvisited:
                    state[nxt] = on_path
                    stack.append((nxt, iter(graph.get(nxt, ()))))
                    break
            else:
                # All successors explored without finding a back edge.
                state[node] = finished
                stack.pop()
    return False
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _lint_indexes(model: Dict[str, Any], entity_field_map: Dict[str, Set[str]]) -> List[Issue]:
    """Validate the model's ``indexes`` section.

    Reports, all as errors under the ``/indexes`` path: an index name that
    was already seen, an index whose entity is not in *entity_field_map*,
    and index fields that the entity does not have. Field checks are
    skipped for an index whose entity was not found.
    """
    findings: List[Issue] = []

    def flag(code: str, message: str) -> None:
        # Every index finding shares the same severity and path.
        findings.append(
            Issue(severity="error", code=code, message=message, path="/indexes")
        )

    registered: Set[str] = set()
    for spec in model.get("indexes", []):
        idx_name = spec.get("name", "")
        entity_name = spec.get("entity", "")
        columns = spec.get("fields", [])

        if idx_name not in registered:
            registered.add(idx_name)
        else:
            flag("DUPLICATE_INDEX", f"Duplicate index name '{idx_name}'.")

        if entity_name and entity_name not in entity_field_map:
            flag(
                "INDEX_ENTITY_NOT_FOUND",
                f"Index '{idx_name}' references non-existent entity '{entity_name}'.",
            )
            continue

        known_fields = entity_field_map.get(entity_name, set())
        for field_name in columns:
            if not field_name or field_name in known_fields:
                continue
            flag(
                "INDEX_FIELD_NOT_FOUND",
                f"Index '{idx_name}' references non-existent field '{entity_name}.{field_name}'.",
            )

    return findings
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _lint_glossary(model: Dict[str, Any], refs: Set[str]) -> List[Issue]:
    """Validate the model's ``glossary`` section.

    A term name that was already seen produces a warning; every non-empty
    ``related_fields`` entry that is not contained in *refs* produces an
    error. All findings use the ``/glossary`` path.
    """
    findings: List[Issue] = []
    known_terms: Set[str] = set()

    for entry in model.get("glossary", []):
        term = entry.get("term", "")

        if term not in known_terms:
            known_terms.add(term)
        else:
            findings.append(
                Issue(
                    severity="warn",
                    code="DUPLICATE_GLOSSARY_TERM",
                    message=f"Duplicate glossary term '{term}'.",
                    path="/glossary",
                )
            )

        for field_ref in entry.get("related_fields", []):
            if not field_ref or field_ref in refs:
                continue
            findings.append(
                Issue(
                    severity="error",
                    code="GLOSSARY_REF_NOT_FOUND",
                    message=f"Glossary term '{term}' references non-existent field '{field_ref}'.",
                    path="/glossary",
                )
            )

    return findings
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _lint_grain_and_metrics(
    model: Dict[str, Any],
    entity_field_map: Dict[str, Set[str]],
) -> List[Issue]:
    """Validate entity grain declarations and the ``metrics`` section.

    Entity checks: grain is mandatory for table/view/materialized_view/
    fact_table entities when the model layer is ``transform`` or ``report``;
    grain entries must be unique and must name fields of the entity.

    Metric checks: a ``report`` layer model needs at least one metric;
    metric names must be unique; the metric's entity must exist (its other
    checks are skipped otherwise); grain, dimension and time_dimension
    fields must exist on that entity; a deprecated metric without a
    ``deprecated_message`` produces a warning.
    """
    issues: List[Issue] = []

    def emit(code: str, message: str, path: str, severity: str = "error") -> None:
        issues.append(Issue(severity=severity, code=code, message=message, path=path))

    model_layer = str(model.get("model", {}).get("layer", "")).lower().strip()
    grain_is_mandatory = model_layer in {"transform", "report"}
    grain_bearing_types = {"table", "view", "materialized_view", "fact_table"}

    # ---- entity grain -----------------------------------------------------
    for entity in model.get("entities", []):
        entity_name = str(entity.get("name", ""))
        entity_type = str(entity.get("type", "table"))
        declared_grain = entity.get("grain")
        grain_fields = declared_grain if isinstance(declared_grain, list) else []
        known_fields = entity_field_map.get(entity_name, set())

        if grain_is_mandatory and entity_type in grain_bearing_types and not grain_fields:
            emit(
                "MISSING_GRAIN",
                f"Entity '{entity_name}' must declare grain in '{model_layer}' layer models.",
                f"/entities/{entity_name}",
            )

        already_listed: Set[str] = set()
        for field_name in grain_fields:
            if field_name in already_listed:
                emit(
                    "DUPLICATE_GRAIN_FIELD",
                    f"Entity '{entity_name}' grain contains duplicate field '{field_name}'.",
                    f"/entities/{entity_name}/grain",
                )
            already_listed.add(field_name)

            if field_name not in known_fields:
                emit(
                    "GRAIN_FIELD_NOT_FOUND",
                    f"Entity '{entity_name}' grain references non-existent field '{field_name}'.",
                    f"/entities/{entity_name}/grain",
                )

    # ---- metrics ----------------------------------------------------------
    metrics = model.get("metrics", [])
    if model_layer == "report" and not metrics:
        emit(
            "MISSING_METRICS",
            "Report layer models must define at least one metric.",
            "/metrics",
        )

    metric_names: Set[str] = set()
    for metric in metrics:
        name = str(metric.get("name", ""))
        entity_name = str(metric.get("entity", ""))
        known_fields = entity_field_map.get(entity_name, set())

        if name not in metric_names:
            metric_names.add(name)
        else:
            emit("DUPLICATE_METRIC", f"Duplicate metric name '{name}'.", "/metrics")

        if entity_name not in entity_field_map:
            emit(
                "METRIC_ENTITY_NOT_FOUND",
                f"Metric '{name}' references non-existent entity '{entity_name}'.",
                "/metrics",
            )
            # Field-level checks are meaningless without the entity.
            continue

        metric_grain = metric.get("grain")
        for grain_field in metric_grain if isinstance(metric_grain, list) else []:
            if grain_field in known_fields:
                continue
            emit(
                "METRIC_GRAIN_FIELD_NOT_FOUND",
                f"Metric '{name}' grain field '{entity_name}.{grain_field}' does not exist.",
                f"/metrics/{name}",
            )

        metric_dims = metric.get("dimensions")
        for dim_field in metric_dims if isinstance(metric_dims, list) else []:
            if dim_field in known_fields:
                continue
            emit(
                "METRIC_DIMENSION_NOT_FOUND",
                f"Metric '{name}' dimension field '{entity_name}.{dim_field}' does not exist.",
                f"/metrics/{name}",
            )

        time_dim = str(metric.get("time_dimension", "")).strip()
        if time_dim and time_dim not in known_fields:
            emit(
                "METRIC_TIME_DIMENSION_NOT_FOUND",
                f"Metric '{name}' time_dimension '{entity_name}.{time_dim}' does not exist.",
                f"/metrics/{name}",
            )

        if metric.get("deprecated") is True and not metric.get("deprecated_message"):
            emit(
                "METRIC_DEPRECATED_WITHOUT_MESSAGE",
                f"Metric '{name}' is deprecated but missing deprecated_message.",
                f"/metrics/{name}",
                severity="warn",
            )

    return issues
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
def _lint_smart_nudges(
|
|
525
|
+
model: Dict[str, Any],
|
|
526
|
+
entity_field_map: Dict[str, Set[str]],
|
|
527
|
+
refs: Set[str],
|
|
528
|
+
) -> List[Issue]:
|
|
529
|
+
"""
|
|
530
|
+
Context-aware gap detection rules that surface missing best-practice
|
|
531
|
+
metadata before hard validation catches structural errors. All issues
|
|
532
|
+
are severity='warn' so they never block CI on their own.
|
|
533
|
+
"""
|
|
534
|
+
issues: List[Issue] = []
|
|
535
|
+
model_layer = str(model.get("model", {}).get("layer", "")).lower().strip()
|
|
536
|
+
gov_classification: Dict[str, str] = model.get("governance", {}).get("classification", {}) or {}
|
|
537
|
+
glossary_terms = model.get("glossary", [])
|
|
538
|
+
relationships = model.get("relationships", [])
|
|
539
|
+
imports = model.get("model", {}).get("imports", [])
|
|
540
|
+
|
|
541
|
+
# Collect entity names referenced from relationships
|
|
542
|
+
rel_entity_names: Set[str] = set()
|
|
543
|
+
for rel in relationships:
|
|
544
|
+
for side in ("from", "to"):
|
|
545
|
+
ref = rel.get(side, "")
|
|
546
|
+
if "." in ref:
|
|
547
|
+
rel_entity_names.add(ref.split(".", 1)[0])
|
|
548
|
+
|
|
549
|
+
# Collect entity names that glossary terms point to
|
|
550
|
+
glossary_entity_refs = _glossary_entity_refs(model)
|
|
551
|
+
|
|
552
|
+
# Collect all imported entity names
|
|
553
|
+
imported_entity_names: Set[str] = set()
|
|
554
|
+
for imp in imports:
|
|
555
|
+
for ent in imp.get("entities", []):
|
|
556
|
+
imported_entity_names.add(str(ent))
|
|
557
|
+
|
|
558
|
+
for entity in model.get("entities", []):
|
|
559
|
+
entity_name = entity.get("name", "")
|
|
560
|
+
entity_type = entity.get("type", "table")
|
|
561
|
+
fields = entity.get("fields", [])
|
|
562
|
+
field_count = len(fields)
|
|
563
|
+
path = f"/entities/{entity_name}"
|
|
564
|
+
|
|
565
|
+
# ── Nudge 1: Missing entity description ───────────────────────────────
|
|
566
|
+
if not entity.get("description", "").strip():
|
|
567
|
+
issues.append(Issue(
|
|
568
|
+
severity="warn",
|
|
569
|
+
code="MISSING_ENTITY_DESCRIPTION",
|
|
570
|
+
message=(
|
|
571
|
+
f"Entity '{entity_name}' has no description. "
|
|
572
|
+
"Add a business-facing description so consumers know what this entity represents."
|
|
573
|
+
),
|
|
574
|
+
path=path,
|
|
575
|
+
))
|
|
576
|
+
|
|
577
|
+
# ── Nudge 2: Missing entity owner ─────────────────────────────────────
|
|
578
|
+
if not entity.get("owner", "").strip():
|
|
579
|
+
issues.append(Issue(
|
|
580
|
+
severity="warn",
|
|
581
|
+
code="MISSING_ENTITY_OWNER",
|
|
582
|
+
message=(
|
|
583
|
+
f"Entity '{entity_name}' has no owner. "
|
|
584
|
+
"Assign an owner (email or team alias) for accountability and stewardship."
|
|
585
|
+
),
|
|
586
|
+
path=path,
|
|
587
|
+
))
|
|
588
|
+
|
|
589
|
+
# ── Nudge 3: Source-layer table missing grain ─────────────────────────
|
|
590
|
+
# transform/report is already an error via _lint_grain_and_metrics;
|
|
591
|
+
# source layer should also declare grain as a best practice.
|
|
592
|
+
if (
|
|
593
|
+
model_layer == "source"
|
|
594
|
+
and entity_type in {"table", "materialized_view"}
|
|
595
|
+
and not entity.get("grain")
|
|
596
|
+
):
|
|
597
|
+
issues.append(Issue(
|
|
598
|
+
severity="warn",
|
|
599
|
+
code="MISSING_GRAIN_SOURCE_LAYER",
|
|
600
|
+
message=(
|
|
601
|
+
f"Entity '{entity_name}' in a source-layer model has no grain defined. "
|
|
602
|
+
"Declaring grain clarifies the unit of observation and prevents downstream metric errors."
|
|
603
|
+
),
|
|
604
|
+
path=path,
|
|
605
|
+
))
|
|
606
|
+
|
|
607
|
+
# ── Nudge 4: Sensitive field tag without governance classification ─────
|
|
608
|
+
for f in fields:
|
|
609
|
+
fname = f.get("name", "")
|
|
610
|
+
field_tags = [str(t) for t in (f.get("tags") or [])]
|
|
611
|
+
field_ref = f"{entity_name}.{fname}"
|
|
612
|
+
has_pii_tag = any(t in _PII_TAGS for t in field_tags)
|
|
613
|
+
if has_pii_tag and field_ref not in gov_classification:
|
|
614
|
+
issues.append(Issue(
|
|
615
|
+
severity="warn",
|
|
616
|
+
code="PII_TAG_WITHOUT_CLASSIFICATION",
|
|
617
|
+
message=(
|
|
618
|
+
f"Field '{field_ref}' has a PII/PHI/PCI tag but no governance.classification entry. "
|
|
619
|
+
"Add a classification so policy checks and data contracts can enforce access controls."
|
|
620
|
+
),
|
|
621
|
+
path=f"{path}/fields/{fname}",
|
|
622
|
+
))
|
|
623
|
+
|
|
624
|
+
# ── Nudge 5: sensitivity=restricted/confidential without classification
|
|
625
|
+
for f in fields:
|
|
626
|
+
fname = f.get("name", "")
|
|
627
|
+
sensitivity = str(f.get("sensitivity", "")).lower()
|
|
628
|
+
field_ref = f"{entity_name}.{fname}"
|
|
629
|
+
if sensitivity in _SENSITIVE_VALUES and field_ref not in gov_classification:
|
|
630
|
+
issues.append(Issue(
|
|
631
|
+
severity="warn",
|
|
632
|
+
code="SENSITIVITY_WITHOUT_CLASSIFICATION",
|
|
633
|
+
message=(
|
|
634
|
+
f"Field '{field_ref}' has sensitivity='{sensitivity}' but no governance.classification. "
|
|
635
|
+
"Pair sensitivity labels with an explicit classification (PII, PCI, PHI, CONFIDENTIAL)."
|
|
636
|
+
),
|
|
637
|
+
path=f"{path}/fields/{fname}",
|
|
638
|
+
))
|
|
639
|
+
|
|
640
|
+
# ── Nudge 6: Financial/amount fields with no examples ─────────────────
|
|
641
|
+
for f in fields:
|
|
642
|
+
fname = f.get("name", "")
|
|
643
|
+
if _FINANCIAL_PATTERN.search(fname) and not f.get("examples"):
|
|
644
|
+
issues.append(Issue(
|
|
645
|
+
severity="warn",
|
|
646
|
+
code="FINANCIAL_FIELD_NO_EXAMPLES",
|
|
647
|
+
message=(
|
|
648
|
+
f"Field '{entity_name}.{fname}' looks like a financial value but has no examples. "
|
|
649
|
+
"Add examples (e.g. unit, currency, scale) so consumers interpret it correctly."
|
|
650
|
+
),
|
|
651
|
+
path=f"{path}/fields/{fname}",
|
|
652
|
+
))
|
|
653
|
+
|
|
654
|
+
# ── Nudge 7: created_at present but no updated_at ─────────────────────
|
|
655
|
+
field_names_list = [f.get("name", "") for f in fields]
|
|
656
|
+
has_created = any(_AUDIT_TIMESTAMP_CREATED.match(n) for n in field_names_list)
|
|
657
|
+
has_updated = any(_AUDIT_TIMESTAMP_UPDATED.match(n) for n in field_names_list)
|
|
658
|
+
if has_created and not has_updated and entity_type == "table":
|
|
659
|
+
issues.append(Issue(
|
|
660
|
+
severity="warn",
|
|
661
|
+
code="CREATED_WITHOUT_UPDATED",
|
|
662
|
+
message=(
|
|
663
|
+
f"Entity '{entity_name}' has a created_at timestamp but no updated_at equivalent. "
|
|
664
|
+
"If records are mutable, add an updated_at field to support incremental loads."
|
|
665
|
+
),
|
|
666
|
+
path=path,
|
|
667
|
+
))
|
|
668
|
+
|
|
669
|
+
# ── Nudge 8: Low field description coverage (<50%) ────────────────────
|
|
670
|
+
if field_count > 0:
|
|
671
|
+
described = sum(1 for f in fields if f.get("description", "").strip())
|
|
672
|
+
pct = described / field_count * 100
|
|
673
|
+
if pct < 50:
|
|
674
|
+
issues.append(Issue(
|
|
675
|
+
severity="warn",
|
|
676
|
+
code="LOW_FIELD_DESCRIPTION_COVERAGE",
|
|
677
|
+
message=(
|
|
678
|
+
f"Entity '{entity_name}' has only {pct:.0f}% of fields described "
|
|
679
|
+
f"({described}/{field_count}). "
|
|
680
|
+
"Add descriptions to make this entity usable as a single source of truth."
|
|
681
|
+
),
|
|
682
|
+
path=path,
|
|
683
|
+
))
|
|
684
|
+
|
|
685
|
+
# ── Nudge 9: Large entity (>10 fields) with no indexes ────────────────
|
|
686
|
+
if field_count > 10 and entity_type == "table":
|
|
687
|
+
entity_indexes = [
|
|
688
|
+
idx for idx in model.get("indexes", [])
|
|
689
|
+
if idx.get("entity") == entity_name
|
|
690
|
+
]
|
|
691
|
+
if not entity_indexes:
|
|
692
|
+
issues.append(Issue(
|
|
693
|
+
severity="warn",
|
|
694
|
+
code="LARGE_ENTITY_NO_INDEXES",
|
|
695
|
+
message=(
|
|
696
|
+
f"Entity '{entity_name}' has {field_count} fields but no indexes defined. "
|
|
697
|
+
"Consider adding indexes on frequently queried or join columns."
|
|
698
|
+
),
|
|
699
|
+
path=path,
|
|
700
|
+
))
|
|
701
|
+
|
|
702
|
+
# ── Nudge 10: Report-layer entity not covered by any metric ───────────
|
|
703
|
+
if model_layer == "report" and entity_type in {"table", "materialized_view"}:
|
|
704
|
+
entity_metrics = [
|
|
705
|
+
m for m in model.get("metrics", [])
|
|
706
|
+
if m.get("entity") == entity_name
|
|
707
|
+
]
|
|
708
|
+
if not entity_metrics:
|
|
709
|
+
issues.append(Issue(
|
|
710
|
+
severity="warn",
|
|
711
|
+
code="REPORT_ENTITY_NO_METRICS",
|
|
712
|
+
message=(
|
|
713
|
+
f"Entity '{entity_name}' is in a report-layer model but has no metrics defined for it. "
|
|
714
|
+
"Report entities should expose at least one metric (KPI, aggregate, or measure)."
|
|
715
|
+
),
|
|
716
|
+
path=path,
|
|
717
|
+
))
|
|
718
|
+
|
|
719
|
+
# ── Nudge 13: fact_table without dimension_refs ───────────────────────
|
|
720
|
+
if entity_type == "fact_table":
|
|
721
|
+
dim_refs = entity.get("dimension_refs", [])
|
|
722
|
+
if not isinstance(dim_refs, list) or not dim_refs:
|
|
723
|
+
issues.append(Issue(
|
|
724
|
+
severity="warn",
|
|
725
|
+
code="FACT_WITHOUT_DIMENSION_REFS",
|
|
726
|
+
message=(
|
|
727
|
+
f"Fact table '{entity_name}' has no dimension_refs defined. "
|
|
728
|
+
"Declare which dimensions this fact references for star schema clarity and auto-layout."
|
|
729
|
+
),
|
|
730
|
+
path=path,
|
|
731
|
+
))
|
|
732
|
+
|
|
733
|
+
# ── Nudge 14: dimension_table without natural_key ─────────────────────
|
|
734
|
+
if entity_type == "dimension_table" and not entity.get("natural_key", "").strip():
|
|
735
|
+
issues.append(Issue(
|
|
736
|
+
severity="warn",
|
|
737
|
+
code="DIM_WITHOUT_NATURAL_KEY",
|
|
738
|
+
message=(
|
|
739
|
+
f"Dimension table '{entity_name}' has no natural_key defined. "
|
|
740
|
+
"Declare the business key so SCD tracking and deduplication work correctly."
|
|
741
|
+
),
|
|
742
|
+
path=path,
|
|
743
|
+
))
|
|
744
|
+
|
|
745
|
+
# ── Nudge 15: SCD Type 2 dimension missing system fields ──────────────
|
|
746
|
+
if entity_type == "dimension_table" and entity.get("scd_type") == 2:
|
|
747
|
+
field_names_set = {f.get("name", "") for f in fields}
|
|
748
|
+
scd2_required = {"effective_from", "effective_to", "is_current"}
|
|
749
|
+
missing_scd2 = scd2_required - field_names_set
|
|
750
|
+
if missing_scd2:
|
|
751
|
+
missing_str = ", ".join(sorted(missing_scd2))
|
|
752
|
+
issues.append(Issue(
|
|
753
|
+
severity="warn",
|
|
754
|
+
code="SCD2_MISSING_SYSTEM_FIELDS",
|
|
755
|
+
message=(
|
|
756
|
+
f"SCD Type 2 dimension '{entity_name}' is missing system fields: {missing_str}. "
|
|
757
|
+
"Add effective_from (DATE), effective_to (DATE), and is_current (BOOLEAN) to track historical validity."
|
|
758
|
+
),
|
|
759
|
+
path=path,
|
|
760
|
+
))
|
|
761
|
+
|
|
762
|
+
# ── Nudge 16: fact_table in report layer with no metrics ──────────────
|
|
763
|
+
if entity_type == "fact_table" and model_layer == "report":
|
|
764
|
+
entity_metrics = [m for m in model.get("metrics", []) if m.get("entity") == entity_name]
|
|
765
|
+
if not entity_metrics:
|
|
766
|
+
issues.append(Issue(
|
|
767
|
+
severity="warn",
|
|
768
|
+
code="FACT_TABLE_NO_METRICS",
|
|
769
|
+
message=(
|
|
770
|
+
f"Fact table '{entity_name}' is in a report-layer model but has no metrics defined. "
|
|
771
|
+
"Define at least one metric (measure/KPI) on this fact table."
|
|
772
|
+
),
|
|
773
|
+
path=path,
|
|
774
|
+
))
|
|
775
|
+
|
|
776
|
+
# ── Model-level nudges ────────────────────────────────────────────────────
|
|
777
|
+
|
|
778
|
+
# ── Nudge 11: Glossary defined but no terms cross-reference any field ─────
|
|
779
|
+
if glossary_terms:
|
|
780
|
+
any_refs = any(term.get("related_fields") for term in glossary_terms)
|
|
781
|
+
if not any_refs:
|
|
782
|
+
issues.append(Issue(
|
|
783
|
+
severity="warn",
|
|
784
|
+
code="GLOSSARY_NO_FIELD_REFS",
|
|
785
|
+
message=(
|
|
786
|
+
"The glossary has terms defined but none have related_fields cross-references. "
|
|
787
|
+
"Link terms to physical fields so the business dictionary connects to the data model."
|
|
788
|
+
),
|
|
789
|
+
path="/glossary",
|
|
790
|
+
))
|
|
791
|
+
|
|
792
|
+
# ── Nudge 12: Imports declared but imported entities unused in relationships
|
|
793
|
+
if imported_entity_names:
|
|
794
|
+
used_in_rels = rel_entity_names & imported_entity_names
|
|
795
|
+
unused_imports = imported_entity_names - used_in_rels
|
|
796
|
+
# Also check if they appear as FK targets in field-level refs (in refs set)
|
|
797
|
+
unused_imports = {
|
|
798
|
+
e for e in unused_imports
|
|
799
|
+
if not any(ref.startswith(f"{e}.") for ref in refs)
|
|
800
|
+
}
|
|
801
|
+
# Remove locally-defined entities from the check
|
|
802
|
+
local_entities = {ent.get("name", "") for ent in model.get("entities", [])}
|
|
803
|
+
unused_imports -= local_entities
|
|
804
|
+
for ent_name in sorted(unused_imports):
|
|
805
|
+
issues.append(Issue(
|
|
806
|
+
severity="warn",
|
|
807
|
+
code="ORPHAN_IMPORT_ENTITY",
|
|
808
|
+
message=(
|
|
809
|
+
f"Imported entity '{ent_name}' is never referenced in relationships or foreign keys. "
|
|
810
|
+
"Either use it in a relationship definition or remove the import to keep the model clean."
|
|
811
|
+
),
|
|
812
|
+
path="/model/imports",
|
|
813
|
+
))
|
|
814
|
+
|
|
815
|
+
return issues
|
|
816
|
+
|
|
817
|
+
|
|
818
|
+
def _lint_phase1_modeling_core(
    model: Dict[str, Any],
    entity_field_map: Dict[str, Set[str]],
) -> List[Issue]:
    """Lint key sets, dimensional / data vault properties and subtype links.

    Checks performed, in order:

    * a conceptual/logical model that also declares a pipeline layer;
    * per entity: ``candidate_keys`` / ``business_keys`` shape, dimension-only
      properties, ``dimension_refs`` targets, data vault properties (hub, link,
      satellite), layer/type combinations, and ``subtype_of`` / ``subtypes``
      consistency;
    * a cycle in the combined subtype/supertype graph.

    Args:
        model: Model document. ``model["model"]["kind"]`` and
            ``model["model"]["layer"]`` are read here; entities are obtained
            through ``_entity_map(model)``.
        entity_field_map: Entity name -> set of field names declared on that
            entity. Entities missing from the map are treated as having no
            fields.

    Returns:
        The issues found, in the order the checks ran.
    """
    issues: List[Issue] = []

    def report(severity: str, code: str, message: str, path: str) -> None:
        # Single place where a finding is recorded.
        issues.append(Issue(severity=severity, code=code, message=message, path=path))

    entities_by_name = _entity_map(model)
    model_meta = model.get("model", {})
    kind = str(model_meta.get("kind", "physical")).lower().strip()
    model_layer = str(model_meta.get("layer", "")).lower().strip()

    if kind in {"conceptual", "logical"} and model_layer:
        report(
            "warn",
            "PIPELINE_LAYER_ON_NON_PHYSICAL_MODEL",
            f"Model kind '{kind}' also declares pipeline layer '{model_layer}'. "
            "Pipeline layers are usually reserved for physical execution models.",
            "/model/layer",
        )

    # Edges point from a subtype to its supertype(s); filled in from both the
    # child's ``subtype_of`` and the parent's ``subtypes`` declarations.
    subtype_graph: Dict[str, Set[str]] = {name: set() for name in entities_by_name}

    def validate_keysets(entity_name: str, keysets: Any, local_fields: Set[str], path_key: str, prefix: str) -> None:
        """Validate a list of key sets (each a non-empty list of field names)."""
        if not isinstance(keysets, list):
            return
        seen: Set[Tuple[str, ...]] = set()
        keyset_path = f"/entities/{entity_name}/{path_key}"
        for index, keyset in enumerate(keysets):
            if not isinstance(keyset, list) or not keyset:
                report(
                    "error",
                    f"{prefix}_EMPTY",
                    f"Entity '{entity_name}' {path_key} entry {index + 1} must contain at least one field.",
                    keyset_path,
                )
                continue
            local_seen: Set[str] = set()
            for field_name in keyset:
                name = str(field_name)
                if name in local_seen:
                    report(
                        "error",
                        f"{prefix}_DUPLICATE_FIELD",
                        f"Entity '{entity_name}' {path_key} contains duplicate field '{name}'.",
                        keyset_path,
                    )
                else:
                    local_seen.add(name)
                if name not in local_fields:
                    report(
                        "error",
                        f"{prefix}_FIELD_NOT_FOUND",
                        f"Entity '{entity_name}' {path_key} field '{name}' does not exist.",
                        keyset_path,
                    )
            # Order-insensitive identity of the key set, used to spot repeats.
            signature = tuple(sorted(local_seen))
            if signature in seen:
                report(
                    "warn",
                    f"{prefix}_DUPLICATE_SET",
                    f"Entity '{entity_name}' declares the same {path_key} more than once.",
                    keyset_path,
                )
            else:
                seen.add(signature)

    def validate_field_property(entity_name: str, local_fields: Set[str], prop_name: str, code: str) -> None:
        """Require ``prop_name`` to be set and to name an existing field."""
        field_name = str(entities_by_name[entity_name].get(prop_name) or "").strip()
        if not field_name:
            report(
                "error",
                f"MISSING_{code}",
                f"Entity '{entity_name}' must declare '{prop_name}'.",
                f"/entities/{entity_name}/{prop_name}",
            )
            return
        if field_name not in local_fields:
            report(
                "error",
                f"{code}_FIELD_NOT_FOUND",
                f"Entity '{entity_name}' {prop_name} field '{field_name}' does not exist.",
                f"/entities/{entity_name}/{prop_name}",
            )

    for entity_name, entity in entities_by_name.items():
        entity_type = str(entity.get("type", "table"))
        local_fields = entity_field_map.get(entity_name, set())

        validate_keysets(entity_name, entity.get("candidate_keys"), local_fields, "candidate_keys", "CANDIDATE_KEY")
        validate_keysets(entity_name, entity.get("business_keys"), local_fields, "business_keys", "BUSINESS_KEY")

        # ── Dimensional properties ────────────────────────────────────────
        if entity_type == "dimension_table":
            natural_key = str(entity.get("natural_key") or "").strip()
            surrogate_key = str(entity.get("surrogate_key") or "").strip()
            if natural_key and natural_key not in local_fields:
                report(
                    "error",
                    "NATURAL_KEY_FIELD_NOT_FOUND",
                    f"Dimension '{entity_name}' natural_key field '{natural_key}' does not exist.",
                    f"/entities/{entity_name}/natural_key",
                )
            if surrogate_key and surrogate_key not in local_fields:
                report(
                    "error",
                    "SURROGATE_KEY_FIELD_NOT_FOUND",
                    f"Dimension '{entity_name}' surrogate_key field '{surrogate_key}' does not exist.",
                    f"/entities/{entity_name}/surrogate_key",
                )
            if natural_key and surrogate_key and natural_key == surrogate_key:
                report(
                    "error",
                    "DIMENSION_KEYS_COLLIDE",
                    f"Dimension '{entity_name}' uses the same field for natural_key and surrogate_key.",
                    f"/entities/{entity_name}",
                )
            if entity.get("scd_type") == 2 and not surrogate_key:
                report(
                    "warn",
                    "SCD2_DIMENSION_WITHOUT_SURROGATE_KEY",
                    f"SCD Type 2 dimension '{entity_name}' should declare surrogate_key.",
                    f"/entities/{entity_name}/surrogate_key",
                )
        else:
            for prop_name in ("scd_type", "natural_key", "surrogate_key", "conformed"):
                if entity.get(prop_name) not in (None, False, "", []):
                    report(
                        "warn",
                        "DIMENSION_ONLY_PROPERTY_ON_NON_DIMENSION",
                        f"Entity '{entity_name}' declares '{prop_name}' but is not a dimension_table.",
                        f"/entities/{entity_name}/{prop_name}",
                    )

        dimension_refs = entity.get("dimension_refs")
        if entity_type == "fact_table":
            for ref_name in dimension_refs if isinstance(dimension_refs, list) else []:
                referenced = entities_by_name.get(str(ref_name))
                # Unknown targets are not reported here; only known entities
                # of the wrong type are.
                if referenced and str(referenced.get("type", "")) not in {"dimension_table", "bridge_table"}:
                    report(
                        "warn",
                        "DIMENSION_REF_WRONG_TYPE",
                        f"Fact table '{entity_name}' dimension_refs entry '{ref_name}' points to "
                        f"'{referenced.get('type')}', expected dimension_table or bridge_table.",
                        f"/entities/{entity_name}/dimension_refs",
                    )
        elif dimension_refs:
            report(
                "warn",
                "DIMENSION_REFS_ON_NON_FACT",
                f"Entity '{entity_name}' declares dimension_refs but is not a fact_table.",
                f"/entities/{entity_name}/dimension_refs",
            )

        # ── Data vault properties ─────────────────────────────────────────
        if entity_type in DATA_VAULT_ENTITY_TYPES:
            validate_field_property(entity_name, local_fields, "load_timestamp_field", "LOAD_TIMESTAMP")
            validate_field_property(entity_name, local_fields, "record_source_field", "RECORD_SOURCE")

            if entity_type == "hub":
                business_keys = entity.get("business_keys")
                if not isinstance(business_keys, list) or not business_keys:
                    report(
                        "error",
                        "HUB_MISSING_BUSINESS_KEYS",
                        f"Hub '{entity_name}' must declare business_keys.",
                        f"/entities/{entity_name}/business_keys",
                    )
                validate_field_property(entity_name, local_fields, "hash_key", "HASH_KEY")
                hash_key = str(entity.get("hash_key") or "").strip()
                if hash_key and hash_key in local_fields:
                    hash_field = next(
                        (field for field in entity.get("fields", []) if field.get("name") == hash_key),
                        {},
                    )
                    if not hash_field.get("primary_key"):
                        report(
                            "error",
                            "HUB_HASH_KEY_NOT_PRIMARY_KEY",
                            f"Hub '{entity_name}' hash_key '{hash_key}' should be marked primary_key.",
                            f"/entities/{entity_name}/hash_key",
                        )
            elif entity_type == "link":
                link_refs = entity.get("link_refs")
                if not isinstance(link_refs, list) or len(link_refs) < 2:
                    report(
                        "error",
                        "LINK_REQUIRES_TWO_REFS",
                        f"Link '{entity_name}' must reference at least two hubs in link_refs.",
                        f"/entities/{entity_name}/link_refs",
                    )
                else:
                    for ref_name in link_refs:
                        referenced = entities_by_name.get(str(ref_name))
                        if referenced is None:
                            report(
                                "error",
                                "LINK_REF_NOT_FOUND",
                                f"Link '{entity_name}' references unknown hub '{ref_name}'.",
                                f"/entities/{entity_name}/link_refs",
                            )
                        elif str(referenced.get("type", "")) != "hub":
                            report(
                                "warn",
                                "LINK_REF_NOT_HUB",
                                f"Link '{entity_name}' references '{ref_name}' which is not a hub.",
                                f"/entities/{entity_name}/link_refs",
                            )
                validate_field_property(entity_name, local_fields, "hash_key", "HASH_KEY")
            elif entity_type == "satellite":
                parent_entity = str(entity.get("parent_entity") or "").strip()
                if not parent_entity:
                    report(
                        "error",
                        "SATELLITE_MISSING_PARENT",
                        f"Satellite '{entity_name}' must declare parent_entity.",
                        f"/entities/{entity_name}/parent_entity",
                    )
                else:
                    parent = entities_by_name.get(parent_entity)
                    if parent is None:
                        report(
                            "error",
                            "SATELLITE_PARENT_NOT_FOUND",
                            f"Satellite '{entity_name}' references unknown parent_entity '{parent_entity}'.",
                            f"/entities/{entity_name}/parent_entity",
                        )
                    elif str(parent.get("type", "")) not in {"hub", "link"}:
                        report(
                            "error",
                            "SATELLITE_PARENT_WRONG_TYPE",
                            f"Satellite '{entity_name}' parent_entity '{parent_entity}' must be a hub or link.",
                            f"/entities/{entity_name}/parent_entity",
                        )

                hash_diff_fields = entity.get("hash_diff_fields")
                if not isinstance(hash_diff_fields, list) or not hash_diff_fields:
                    report(
                        "error",
                        "SATELLITE_MISSING_HASH_DIFF_FIELDS",
                        f"Satellite '{entity_name}' must declare hash_diff_fields.",
                        f"/entities/{entity_name}/hash_diff_fields",
                    )
        else:
            for prop_name in (
                "business_keys",
                "hash_key",
                "link_refs",
                "parent_entity",
                "hash_diff_fields",
                "load_timestamp_field",
                "record_source_field",
            ):
                if entity.get(prop_name) not in (None, False, "", []):
                    report(
                        "warn",
                        "DATA_VAULT_PROPERTY_ON_NON_DV_ENTITY",
                        f"Entity '{entity_name}' declares '{prop_name}' but is not a data vault entity.",
                        f"/entities/{entity_name}/{prop_name}",
                    )

        # ── Layer / entity-type combinations ──────────────────────────────
        if model_layer == "source" and entity_type in (DIMENSIONAL_ENTITY_TYPES | DATA_VAULT_ENTITY_TYPES):
            report(
                "warn",
                "SOURCE_LAYER_WITH_MODELING_PRIMITIVE",
                f"Entity '{entity_name}' uses '{entity_type}' in a source-layer model. "
                "Source layers typically preserve raw source structures rather than dimensional or data vault primitives.",
                f"/entities/{entity_name}/type",
            )
        if model_layer == "report" and entity_type in DATA_VAULT_ENTITY_TYPES:
            report(
                "warn",
                "REPORT_LAYER_WITH_DATA_VAULT_ENTITY",
                f"Entity '{entity_name}' uses data vault type '{entity_type}' in a report-layer model.",
                f"/entities/{entity_name}/type",
            )

        # ── Subtype -> supertype link declared on the child ───────────────
        parent_name = str(entity.get("subtype_of") or "").strip()
        if parent_name:
            subtype_graph.setdefault(entity_name, set()).add(parent_name)
            if parent_name == entity_name:
                report(
                    "error",
                    "SUBTYPE_SELF_REFERENCE",
                    f"Entity '{entity_name}' cannot subtype itself.",
                    f"/entities/{entity_name}/subtype_of",
                )
            elif parent_name in entities_by_name:
                # Guard against a non-list value (e.g. ``subtypes: null``),
                # matching how this entity's own ``subtypes`` is read below;
                # iterating it directly would raise or split a string.
                declared_subtypes = entities_by_name[parent_name].get("subtypes")
                parent_subtypes = (
                    {str(name) for name in declared_subtypes if str(name)}
                    if isinstance(declared_subtypes, list)
                    else set()
                )
                # A parent that lists no subtypes at all is not flagged.
                if parent_subtypes and entity_name not in parent_subtypes:
                    report(
                        "warn",
                        "SUBTYPE_NOT_LISTED_ON_PARENT",
                        f"Entity '{entity_name}' declares subtype_of '{parent_name}' but the parent does not list it in subtypes.",
                        f"/entities/{entity_name}/subtype_of",
                    )

        # ── Supertype -> subtype links declared on the parent ─────────────
        own_subtypes = entity.get("subtypes")
        for raw_child in own_subtypes if isinstance(own_subtypes, list) else []:
            child_name = str(raw_child)
            subtype_graph.setdefault(child_name, set()).add(entity_name)
            if child_name == entity_name:
                report(
                    "error",
                    "SUPERTYPE_SELF_REFERENCE",
                    f"Entity '{entity_name}' cannot list itself in subtypes.",
                    f"/entities/{entity_name}/subtypes",
                )
                continue
            child = entities_by_name.get(child_name)
            if child is None:
                report(
                    "warn",
                    "SUBTYPE_CHILD_NOT_FOUND",
                    f"Entity '{entity_name}' lists unknown subtype '{child_name}'.",
                    f"/entities/{entity_name}/subtypes",
                )
                continue
            # A child with no subtype_of is accepted; only a conflicting
            # parent is reported.
            if str(child.get("subtype_of") or "") not in {"", entity_name}:
                report(
                    "warn",
                    "SUPERTYPE_NOT_LINKED_FROM_CHILD",
                    f"Entity '{entity_name}' lists subtype '{child_name}', but that child points to "
                    f"'{child.get('subtype_of')}' instead.",
                    f"/entities/{entity_name}/subtypes",
                )

    # Skip the cycle search entirely when no subtype edge was declared.
    if any(edges for edges in subtype_graph.values()) and _has_cycle(subtype_graph):
        report(
            "error",
            "SUBTYPE_CYCLE_DETECTED",
            "Subtype/supertype relationships contain a cycle.",
            "/entities",
        )

    return issues
|
|
1207
|
+
|
|
1208
|
+
|
|
1209
|
+
def lint_issues(model: Dict[str, Any]) -> List[Issue]:
    """Run every lint check against ``model`` and return the findings.

    The model is first passed through ``normalize_model``. Entity- and
    field-level checks run first (duplicates, naming, primary keys, computed
    and deprecated fields, ``dimension_refs``), then relationship references
    and governance classifications, then the specialised ``_lint_*`` passes,
    and finally a relationship-cycle check.

    Args:
        model: Model document to lint.

    Returns:
        All issues found, in the order the checks ran.
    """
    model = normalize_model(model)
    issues: List[Issue] = []

    def report(severity: str, code: str, message: str, path: str) -> None:
        # Every finding in this function is recorded through this helper.
        issues.append(Issue(severity=severity, code=code, message=message, path=path))

    refs = _entity_field_refs(model)
    entity_field_map = _entity_field_names(model)
    known_entities: Set[str] = set()

    # ── Entities and their fields ─────────────────────────────────────────
    for entity in model.get("entities", []):
        entity_name = entity.get("name", "")

        if entity_name not in known_entities:
            known_entities.add(entity_name)
        else:
            report(
                "error",
                "DUPLICATE_ENTITY",
                f"Duplicate entity name '{entity_name}'.",
                "/entities",
            )

        if entity_name and not PASCAL_CASE.match(entity_name):
            report(
                "error",
                "INVALID_ENTITY_NAME",
                f"Entity '{entity_name}' must be PascalCase.",
                "/entities",
            )

        known_fields: Set[str] = set()
        has_pk = False

        for field in entity.get("fields", []):
            name = field.get("name", "")
            if name not in known_fields:
                known_fields.add(name)
            else:
                report(
                    "error",
                    "DUPLICATE_FIELD",
                    f"Duplicate field '{name}' in entity '{entity_name}'.",
                    f"/entities/{entity_name}/fields",
                )

            if name and not SNAKE_CASE.match(name):
                report(
                    "error",
                    "INVALID_FIELD_NAME",
                    f"Field '{entity_name}.{name}' must be snake_case.",
                    f"/entities/{entity_name}/fields",
                )

            # Only a literal ``True`` counts as a primary-key marker.
            if field.get("primary_key") is True:
                has_pk = True

            if field.get("computed") is True and not field.get("computed_expression"):
                report(
                    "warn",
                    "MISSING_COMPUTED_EXPRESSION",
                    f"Computed field '{entity_name}.{name}' should have a computed_expression.",
                    f"/entities/{entity_name}/fields",
                )

            if field.get("deprecated") is True:
                report(
                    "warn",
                    "DEPRECATED_FIELD",
                    f"Field '{entity_name}.{name}' is deprecated."
                    + (f" {field['deprecated_message']}" if field.get("deprecated_message") else ""),
                    f"/entities/{entity_name}/fields",
                )

        if not has_pk and entity.get("type", "table") in PK_REQUIRED_TYPES:
            report(
                "warn",
                "MISSING_PRIMARY_KEY",
                f"Table '{entity_name}' must have at least one primary key field.",
                f"/entities/{entity_name}",
            )

        # dimension_refs: warn if a referenced dimension entity is not in this model
        dim_refs = entity.get("dimension_refs", [])
        has_imports = bool(model.get("model", {}).get("imports"))
        if not isinstance(dim_refs, list):
            continue
        for ref_name in dim_refs:
            if not ref_name or ref_name in entity_field_map:
                continue
            report(
                "warn",
                "DIMENSION_REF_NOT_FOUND",
                (
                    f"Fact table '{entity_name}' references dimension '{ref_name}' "
                    f"which is not defined in this model."
                    + (" (may be in an imported model)" if has_imports else "")
                ),
                f"/entities/{entity_name}/dimension_refs",
            )

    # ── Relationships ─────────────────────────────────────────────────────
    for rel in model.get("relationships", []):
        from_ref = rel.get("from", "")
        to_ref = rel.get("to", "")
        name = rel.get("name", "<unnamed>")

        # Syntax of each endpoint first, then existence of each endpoint.
        if from_ref and not REL_REF.match(from_ref):
            report(
                "error",
                "INVALID_RELATIONSHIP_REF",
                f"Relationship '{name}' has invalid 'from' reference '{from_ref}'.",
                "/relationships",
            )
        if to_ref and not REL_REF.match(to_ref):
            report(
                "error",
                "INVALID_RELATIONSHIP_REF",
                f"Relationship '{name}' has invalid 'to' reference '{to_ref}'.",
                "/relationships",
            )
        if from_ref and from_ref not in refs:
            report(
                "error",
                "RELATIONSHIP_REF_NOT_FOUND",
                f"Relationship '{name}' from reference '{from_ref}' does not exist.",
                "/relationships",
            )
        if to_ref and to_ref not in refs:
            report(
                "error",
                "RELATIONSHIP_REF_NOT_FOUND",
                f"Relationship '{name}' to reference '{to_ref}' does not exist.",
                "/relationships",
            )

    # ── Governance classifications ────────────────────────────────────────
    classification = model.get("governance", {}).get("classification", {})
    if isinstance(classification, dict):
        for target, value in classification.items():
            if target not in refs:
                report(
                    "error",
                    "CLASSIFICATION_REF_NOT_FOUND",
                    f"Classification target '{target}' does not exist.",
                    "/governance/classification",
                )
            if value not in ALLOWED_CLASSIFICATIONS:
                report(
                    "error",
                    "INVALID_CLASSIFICATION",
                    f"Classification '{value}' is not allowed.",
                    "/governance/classification",
                )

    # ── Specialised passes, appended in this fixed order ──────────────────
    for extra_issues in (
        _lint_indexes(model, entity_field_map),
        _lint_glossary(model, refs),
        _lint_grain_and_metrics(model, entity_field_map),
        _lint_smart_nudges(model, entity_field_map, refs),
        _lint_phase1_modeling_core(model, entity_field_map),
        _lint_modeling_libraries(model, entity_field_map),
    ):
        issues.extend(extra_issues)

    # ── Relationship cycles ───────────────────────────────────────────────
    graph = _relationship_graph(model)
    if graph and _has_cycle(graph):
        report(
            "warn",
            "CIRCULAR_RELATIONSHIPS",
            "Circular entity relationships detected.",
            "/relationships",
        )

    return issues
|
|
1408
|
+
|
|
1409
|
+
|
|
1410
|
+
def _lint_modeling_libraries(model: Dict[str, Any], entity_field_map: Dict[str, Set[str]]) -> List[Issue]:
|
|
1411
|
+
issues: List[Issue] = []
|
|
1412
|
+
domain_names = {
|
|
1413
|
+
str(domain.get("name") or "")
|
|
1414
|
+
for domain in model.get("domains", [])
|
|
1415
|
+
if isinstance(domain, dict) and domain.get("name")
|
|
1416
|
+
}
|
|
1417
|
+
template_names = {
|
|
1418
|
+
str(template.get("name") or "")
|
|
1419
|
+
for template in model.get("templates", [])
|
|
1420
|
+
if isinstance(template, dict) and template.get("name")
|
|
1421
|
+
}
|
|
1422
|
+
subject_areas = {
|
|
1423
|
+
str(subject_area.get("name") or "")
|
|
1424
|
+
for subject_area in model.get("subject_areas", [])
|
|
1425
|
+
if isinstance(subject_area, dict) and subject_area.get("name")
|
|
1426
|
+
}
|
|
1427
|
+
kind = str(model.get("model", {}).get("kind", "physical"))
|
|
1428
|
+
|
|
1429
|
+
for entity in model.get("entities", []):
|
|
1430
|
+
if not isinstance(entity, dict):
|
|
1431
|
+
continue
|
|
1432
|
+
entity_name = str(entity.get("name", ""))
|
|
1433
|
+
entity_type = str(entity.get("type", ""))
|
|
1434
|
+
if kind == "conceptual" and entity_type != "concept":
|
|
1435
|
+
issues.append(
|
|
1436
|
+
Issue(
|
|
1437
|
+
severity="warn",
|
|
1438
|
+
code="CONCEPTUAL_KIND_WITH_PHYSICAL_ENTITY",
|
|
1439
|
+
message=f"Conceptual model '{entity_name}' should generally use type 'concept'.",
|
|
1440
|
+
path=f"/entities/{entity_name}/type",
|
|
1441
|
+
)
|
|
1442
|
+
)
|
|
1443
|
+
if kind == "logical" and entity_type == "concept":
|
|
1444
|
+
issues.append(
|
|
1445
|
+
Issue(
|
|
1446
|
+
severity="warn",
|
|
1447
|
+
code="LOGICAL_KIND_WITH_CONCEPT_ENTITY",
|
|
1448
|
+
message=f"Logical model '{entity_name}' should generally use type 'logical_entity'.",
|
|
1449
|
+
path=f"/entities/{entity_name}/type",
|
|
1450
|
+
)
|
|
1451
|
+
)
|
|
1452
|
+
if kind == "logical" and entity_type in {"table", "view", "materialized_view", "external_table", "snapshot"}:
|
|
1453
|
+
issues.append(
|
|
1454
|
+
Issue(
|
|
1455
|
+
severity="warn",
|
|
1456
|
+
code="LOGICAL_KIND_WITH_PHYSICAL_ENTITY",
|
|
1457
|
+
message=f"Logical model '{entity_name}' uses physical entity type '{entity_type}'.",
|
|
1458
|
+
path=f"/entities/{entity_name}/type",
|
|
1459
|
+
)
|
|
1460
|
+
)
|
|
1461
|
+
if kind == "physical" and entity_type in {"concept", "logical_entity"}:
|
|
1462
|
+
issues.append(
|
|
1463
|
+
Issue(
|
|
1464
|
+
severity="warn",
|
|
1465
|
+
code="PHYSICAL_KIND_WITH_ABSTRACT_ENTITY",
|
|
1466
|
+
message=f"Physical model '{entity_name}' should use a physical/dimensional/data-vault entity type.",
|
|
1467
|
+
path=f"/entities/{entity_name}/type",
|
|
1468
|
+
)
|
|
1469
|
+
)
|
|
1470
|
+
area = str(entity.get("subject_area") or "")
|
|
1471
|
+
if area and subject_areas and area not in subject_areas:
|
|
1472
|
+
issues.append(
|
|
1473
|
+
Issue(
|
|
1474
|
+
severity="warn",
|
|
1475
|
+
code="SUBJECT_AREA_NOT_FOUND",
|
|
1476
|
+
message=f"Entity '{entity_name}' references undefined subject area '{area}'.",
|
|
1477
|
+
path=f"/entities/{entity_name}/subject_area",
|
|
1478
|
+
)
|
|
1479
|
+
)
|
|
1480
|
+
template_values = []
|
|
1481
|
+
if entity.get("template"):
|
|
1482
|
+
template_values.append(str(entity.get("template")))
|
|
1483
|
+
template_values.extend(str(item) for item in entity.get("templates", []) if str(item))
|
|
1484
|
+
for template_name in template_values:
|
|
1485
|
+
if template_name not in template_names:
|
|
1486
|
+
issues.append(
|
|
1487
|
+
Issue(
|
|
1488
|
+
severity="warn",
|
|
1489
|
+
code="TEMPLATE_NOT_FOUND",
|
|
1490
|
+
message=f"Entity '{entity_name}' references unknown template '{template_name}'.",
|
|
1491
|
+
path=f"/entities/{entity_name}/templates",
|
|
1492
|
+
)
|
|
1493
|
+
)
|
|
1494
|
+
|
|
1495
|
+
fields = entity.get("fields", [])
|
|
1496
|
+
local_fields = entity_field_map.get(entity_name, set())
|
|
1497
|
+
for field in fields:
|
|
1498
|
+
if not isinstance(field, dict):
|
|
1499
|
+
continue
|
|
1500
|
+
field_name = str(field.get("name", ""))
|
|
1501
|
+
domain_name = str(field.get("domain") or "")
|
|
1502
|
+
if domain_name and domain_name not in domain_names:
|
|
1503
|
+
issues.append(
|
|
1504
|
+
Issue(
|
|
1505
|
+
severity="warn",
|
|
1506
|
+
code="DOMAIN_NOT_FOUND",
|
|
1507
|
+
message=f"Field '{entity_name}.{field_name}' references unknown domain '{domain_name}'.",
|
|
1508
|
+
path=f"/entities/{entity_name}/fields/{field_name}/domain",
|
|
1509
|
+
)
|
|
1510
|
+
)
|
|
1511
|
+
if entity.get("subtype_of") and str(entity.get("subtype_of")) not in entity_field_map:
|
|
1512
|
+
issues.append(
|
|
1513
|
+
Issue(
|
|
1514
|
+
severity="warn",
|
|
1515
|
+
code="SUBTYPE_PARENT_NOT_FOUND",
|
|
1516
|
+
message=f"Entity '{entity_name}' references unknown subtype_of parent '{entity.get('subtype_of')}'.",
|
|
1517
|
+
path=f"/entities/{entity_name}/subtype_of",
|
|
1518
|
+
)
|
|
1519
|
+
)
|
|
1520
|
+
if entity.get("partition_by"):
|
|
1521
|
+
for field_name in entity.get("partition_by", []):
|
|
1522
|
+
if field_name not in local_fields:
|
|
1523
|
+
issues.append(
|
|
1524
|
+
Issue(
|
|
1525
|
+
severity="error",
|
|
1526
|
+
code="PARTITION_FIELD_NOT_FOUND",
|
|
1527
|
+
message=f"Entity '{entity_name}' partition_by field '{field_name}' does not exist.",
|
|
1528
|
+
path=f"/entities/{entity_name}/partition_by",
|
|
1529
|
+
)
|
|
1530
|
+
)
|
|
1531
|
+
if entity_type in {"concept", "logical_entity"}:
|
|
1532
|
+
issues.append(
|
|
1533
|
+
Issue(
|
|
1534
|
+
severity="warn",
|
|
1535
|
+
code="PHYSICAL_ONLY_PROPERTY_IN_NON_PHYSICAL_MODEL",
|
|
1536
|
+
message=f"Entity '{entity_name}' declares partition_by in a non-physical entity type.",
|
|
1537
|
+
path=f"/entities/{entity_name}/partition_by",
|
|
1538
|
+
)
|
|
1539
|
+
)
|
|
1540
|
+
if entity.get("cluster_by"):
|
|
1541
|
+
for field_name in entity.get("cluster_by", []):
|
|
1542
|
+
if field_name not in local_fields:
|
|
1543
|
+
issues.append(
|
|
1544
|
+
Issue(
|
|
1545
|
+
severity="error",
|
|
1546
|
+
code="CLUSTER_FIELD_NOT_FOUND",
|
|
1547
|
+
message=f"Entity '{entity_name}' cluster_by field '{field_name}' does not exist.",
|
|
1548
|
+
path=f"/entities/{entity_name}/cluster_by",
|
|
1549
|
+
)
|
|
1550
|
+
)
|
|
1551
|
+
if entity_type in {"concept", "logical_entity"}:
|
|
1552
|
+
issues.append(
|
|
1553
|
+
Issue(
|
|
1554
|
+
severity="warn",
|
|
1555
|
+
code="PHYSICAL_ONLY_PROPERTY_IN_NON_PHYSICAL_MODEL",
|
|
1556
|
+
message=f"Entity '{entity_name}' declares cluster_by in a non-physical entity type.",
|
|
1557
|
+
path=f"/entities/{entity_name}/cluster_by",
|
|
1558
|
+
)
|
|
1559
|
+
)
|
|
1560
|
+
|
|
1561
|
+
return issues
|