odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Explanation Quality Linter
|
|
3
|
+
===========================
|
|
4
|
+
|
|
5
|
+
Validates that explanations meet Odibi quality standards.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import List
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class LintIssue:
    """A linting issue found in an explanation."""

    severity: str  # "error", "warning", "info"
    message: str
    rule: str

    def __str__(self):
        # Robustness fix: severity is a free-form string, so an unexpected
        # value must not raise KeyError — fall back to a neutral bullet.
        symbol = {"error": "❌", "warning": "⚠️", "info": "ℹ️"}.get(self.severity, "•")
        return f"{symbol} {self.message} [{self.rule}]"
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ExplanationLinter:
    """
    Lints explanation text for quality issues.

    Checks:
    - Minimum length
    - Required sections (Purpose, Details, Result)
    - Generic/lazy phrases
    - TODO placeholders
    - Formula formatting
    """

    REQUIRED_SECTIONS = ["Purpose", "Details", "Result"]

    LAZY_PHRASES = [
        "calculates stuff",
        "does things",
        "processes data",
        "handles records",
        "TODO",
        "[placeholder]",
        "TBD",
        "to be determined",
    ]

    MIN_LENGTH = 50  # characters

    def __init__(self):
        # Issues accumulated by the most recent lint() call.
        self.issues: List[LintIssue] = []

    def lint(self, explanation: str, operation_name: str = "unknown") -> List[LintIssue]:
        """
        Lint an explanation and return issues.

        Args:
            explanation: The explanation text
            operation_name: Name of the operation (for error messages)

        Returns:
            List of LintIssue objects
        """
        self.issues = []

        # An empty/whitespace-only explanation short-circuits all other checks.
        if not explanation or not explanation.strip():
            self.issues.append(
                LintIssue(
                    severity="error",
                    message=f"Explanation for '{operation_name}' is empty",
                    rule="E001",
                )
            )
            return self.issues

        # Run every individual check in order.
        checks = (
            self._check_length,
            self._check_required_sections,
            self._check_lazy_phrases,
            self._check_formula_formatting,
        )
        for check in checks:
            check(explanation, operation_name)

        return self.issues

    def _check_length(self, text: str, op_name: str):
        """Check minimum length requirement."""
        char_count = len(text.strip())
        if char_count < self.MIN_LENGTH:
            self.issues.append(
                LintIssue(
                    severity="error",
                    message=f"Explanation for '{op_name}' too short ({char_count} chars, minimum {self.MIN_LENGTH})",
                    rule="E002",
                )
            )

    def _check_required_sections(self, text: str, op_name: str):
        """Check for required sections."""
        for section in self.REQUIRED_SECTIONS:
            # Sections appear as bold markdown headers, colon optional.
            if re.search(f"\\*\\*{section}:?\\*\\*", text, re.IGNORECASE):
                continue
            self.issues.append(
                LintIssue(
                    severity="error",
                    message=f"Explanation for '{op_name}' missing required section: {section}",
                    rule="E003",
                )
            )

    def _check_lazy_phrases(self, text: str, op_name: str):
        """Check for generic/lazy phrases."""
        lowered = text.lower()
        found = [p for p in self.LAZY_PHRASES if p.lower() in lowered]
        for phrase in found:
            self.issues.append(
                LintIssue(
                    severity="error",
                    message=f"Explanation for '{op_name}' contains generic phrase: '{phrase}'",
                    rule="E004",
                )
            )

    def _check_formula_formatting(self, text: str, op_name: str):
        """Check formula formatting."""
        # Mentioning a formula without a fenced code block is only a warning.
        if "formula" in text.lower() and "```" not in text:
            self.issues.append(
                LintIssue(
                    severity="warning",
                    message=f"Explanation for '{op_name}' mentions formula but no code block found",
                    rule="W001",
                )
            )

    def has_errors(self) -> bool:
        """Check if any errors were found."""
        return any(issue.severity == "error" for issue in self.issues)

    def format_issues(self) -> str:
        """Format all issues as string."""
        if not self.issues:
            return "✅ No issues found"
        return "\n".join(str(issue) for issue in self.issues)
odibi/validation/fk.py
ADDED
|
@@ -0,0 +1,547 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Foreign Key Validation Module
|
|
3
|
+
=============================
|
|
4
|
+
|
|
5
|
+
Declare and validate referential integrity between fact and dimension tables.
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
- Declare relationships in YAML
|
|
9
|
+
- Validate referential integrity on fact load
|
|
10
|
+
- Detect orphan records
|
|
11
|
+
- Generate lineage from relationships
|
|
12
|
+
- Integration with FactPattern
|
|
13
|
+
|
|
14
|
+
Example Config:
|
|
15
|
+
relationships:
|
|
16
|
+
- name: orders_to_customers
|
|
17
|
+
fact: fact_orders
|
|
18
|
+
dimension: dim_customer
|
|
19
|
+
fact_key: customer_sk
|
|
20
|
+
dimension_key: customer_sk
|
|
21
|
+
|
|
22
|
+
- name: orders_to_products
|
|
23
|
+
fact: fact_orders
|
|
24
|
+
dimension: dim_product
|
|
25
|
+
fact_key: product_sk
|
|
26
|
+
dimension_key: product_sk
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
import time
|
|
30
|
+
from dataclasses import dataclass, field
|
|
31
|
+
from typing import Any, Dict, List, Optional
|
|
32
|
+
|
|
33
|
+
from pydantic import BaseModel, Field, field_validator
|
|
34
|
+
|
|
35
|
+
from odibi.context import EngineContext
|
|
36
|
+
from odibi.enums import EngineType
|
|
37
|
+
from odibi.utils.logging_context import get_logging_context
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class RelationshipConfig(BaseModel):
    """
    Configuration for a foreign key relationship.

    Attributes:
        name: Unique relationship identifier
        fact: Fact table name
        dimension: Dimension table name
        fact_key: Foreign key column in fact table
        dimension_key: Primary/surrogate key column in dimension
        nullable: Whether nulls are allowed in fact_key
        on_violation: Action on violation ("warn", "error", "quarantine")
    """

    name: str = Field(..., description="Unique relationship identifier")
    fact: str = Field(..., description="Fact table name")
    dimension: str = Field(..., description="Dimension table name")
    fact_key: str = Field(..., description="FK column in fact table")
    dimension_key: str = Field(..., description="PK/SK column in dimension")
    nullable: bool = Field(default=False, description="Allow nulls in fact_key")
    on_violation: str = Field(default="error", description="Action on violation")

    @field_validator("name", "fact", "dimension", "fact_key", "dimension_key")
    @classmethod
    def validate_not_empty(cls, v: str, info) -> str:
        # Reject empty/whitespace-only identifiers; the stored value is
        # stripped of surrounding whitespace.
        if not v or not v.strip():
            raise ValueError(
                f"RelationshipConfig.{info.field_name} cannot be empty. "
                f"Got: {v!r}. Provide a non-empty string value."
            )
        return v.strip()

    @field_validator("on_violation")
    @classmethod
    def validate_on_violation(cls, v: str) -> str:
        # Normalize to lowercase so "Error"/"WARN" etc. are accepted.
        valid = ("warn", "error", "quarantine")
        if v.lower() not in valid:
            raise ValueError(f"Invalid on_violation value. Expected one of {valid}, got: {v!r}.")
        return v.lower()
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class RelationshipRegistry(BaseModel):
    """
    Registry of all declared relationships.

    Attributes:
        relationships: List of relationship configurations
    """

    relationships: List[RelationshipConfig] = Field(
        default_factory=list, description="Relationship definitions"
    )

    def get_relationship(self, name: str) -> Optional[RelationshipConfig]:
        """Get a relationship by name (case-insensitive); None if absent."""
        target = name.lower()
        return next(
            (rel for rel in self.relationships if rel.name.lower() == target),
            None,
        )

    def get_fact_relationships(self, fact_table: str) -> List[RelationshipConfig]:
        """Get all relationships for a fact table."""
        wanted = fact_table.lower()
        return [rel for rel in self.relationships if rel.fact.lower() == wanted]

    def get_dimension_relationships(self, dim_table: str) -> List[RelationshipConfig]:
        """Get all relationships referencing a dimension."""
        wanted = dim_table.lower()
        return [rel for rel in self.relationships if rel.dimension.lower() == wanted]

    def generate_lineage(self) -> Dict[str, List[str]]:
        """
        Generate lineage map from relationships.

        Returns:
            Dict mapping fact tables to their dimension dependencies
        """
        lineage: Dict[str, List[str]] = {}
        for rel in self.relationships:
            dims = lineage.setdefault(rel.fact, [])
            # De-duplicate while preserving declaration order.
            if rel.dimension not in dims:
                dims.append(rel.dimension)
        return lineage
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@dataclass
class OrphanRecord:
    """Details of an orphan record: a fact FK value with no match in its dimension."""

    fact_key_value: Any  # the offending key value from the fact table
    fact_key_column: str  # FK column name in the fact table
    dimension_table: str  # dimension table that was searched for the key
    row_index: Optional[int] = None  # positional row index, if the caller supplies it
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@dataclass
class FKValidationResult:
    """Result of FK validation for a single relationship."""

    relationship_name: str  # RelationshipConfig.name this result belongs to
    valid: bool  # True when no orphans and nulls are allowed or absent
    total_rows: int  # row count of the fact DataFrame (0 when validation errored)
    orphan_count: int  # non-null FK values without a matching dimension key
    null_count: int  # null FK values found in the fact table
    orphan_values: List[Any] = field(default_factory=list)  # sample of orphan key values (capped at 100)
    elapsed_ms: float = 0.0  # wall-clock validation time in milliseconds
    error: Optional[str] = None  # error message when validation could not run
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@dataclass
class FKValidationReport:
    """Complete FK validation report for a fact table."""

    fact_table: str  # fact table the report covers
    all_valid: bool  # True when every relationship validated cleanly
    total_relationships: int  # number of relationships checked
    valid_relationships: int  # how many of them passed
    results: List[FKValidationResult] = field(default_factory=list)  # per-relationship results
    orphan_records: List[OrphanRecord] = field(default_factory=list)  # sampled orphan details across relationships
    elapsed_ms: float = 0.0  # total validation wall-clock time in milliseconds
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class FKValidator:
    """
    Validate foreign key relationships between fact and dimension tables.

    Usage:
        registry = RelationshipRegistry(relationships=[...])
        validator = FKValidator(registry)
        report = validator.validate_fact(fact_df, "fact_orders", context)
    """

    def __init__(self, registry: RelationshipRegistry):
        """
        Initialize with relationship registry.

        Args:
            registry: RelationshipRegistry with relationship definitions
        """
        self.registry = registry

    def validate_relationship(
        self,
        fact_df: Any,
        relationship: RelationshipConfig,
        context: EngineContext,
    ) -> FKValidationResult:
        """
        Validate a single FK relationship.

        Never raises: a missing dimension or engine failure is reported via
        the result's ``error`` field with ``valid=False``.

        Args:
            fact_df: Fact DataFrame to validate
            relationship: Relationship configuration
            context: EngineContext with dimension data

        Returns:
            FKValidationResult with validation details
        """
        ctx = get_logging_context()
        start_time = time.time()

        ctx.debug(
            "Validating FK relationship",
            relationship=relationship.name,
            fact=relationship.fact,
            dimension=relationship.dimension,
        )

        try:
            dim_df = context.get(relationship.dimension)
        except KeyError:
            # Dimension not registered in the context: report, don't raise.
            return FKValidationResult(
                relationship_name=relationship.name,
                valid=False,
                total_rows=0,
                orphan_count=0,
                null_count=0,
                elapsed_ms=(time.time() - start_time) * 1000,
                error=f"Dimension table '{relationship.dimension}' not found",
            )

        try:
            # Dispatch on engine: Spark uses joins, anything else the
            # Pandas-style set-membership path.
            if context.engine_type == EngineType.SPARK:
                result = self._validate_spark(fact_df, dim_df, relationship)
            else:
                result = self._validate_pandas(fact_df, dim_df, relationship)

            result.elapsed_ms = (time.time() - start_time) * 1000

            if result.valid:
                ctx.debug(
                    "FK validation passed",
                    relationship=relationship.name,
                    total_rows=result.total_rows,
                )
            else:
                ctx.warning(
                    "FK validation failed",
                    relationship=relationship.name,
                    orphan_count=result.orphan_count,
                    null_count=result.null_count,
                )

            return result

        except Exception as e:
            ctx.error(
                f"FK validation error: {e}",
                relationship=relationship.name,
            )
            return FKValidationResult(
                relationship_name=relationship.name,
                valid=False,
                total_rows=0,
                orphan_count=0,
                null_count=0,
                elapsed_ms=(time.time() - start_time) * 1000,
                error=str(e),
            )

    def _validate_spark(
        self,
        fact_df: Any,
        dim_df: Any,
        relationship: RelationshipConfig,
    ) -> FKValidationResult:
        """Validate using Spark.

        Counts nulls directly and orphans via a left-anti join against the
        distinct dimension keys; collects a sample of up to 100 orphan values.
        """
        from pyspark.sql import functions as F

        fk_col = relationship.fact_key
        dk_col = relationship.dimension_key

        total_rows = fact_df.count()
        null_count = fact_df.filter(F.col(fk_col).isNull()).count()

        dim_keys = dim_df.select(F.col(dk_col).alias("_dim_key")).distinct()

        non_null_facts = fact_df.filter(F.col(fk_col).isNotNull())
        orphans = non_null_facts.join(
            dim_keys,
            non_null_facts[fk_col] == dim_keys["_dim_key"],
            "left_anti",
        )

        orphan_count = orphans.count()

        orphan_values = []
        if orphan_count > 0:
            # BUGFIX: previously the sample was only collected when
            # orphan_count <= 100, so large violations reported an empty
            # sample. limit(100) already bounds the collect, and the Pandas
            # path always returns up to 100 values — be consistent.
            orphan_values = [
                row[fk_col] for row in orphans.select(fk_col).distinct().limit(100).collect()
            ]

        # Valid only if no orphans and nulls are either allowed or absent.
        is_valid = orphan_count == 0 and (relationship.nullable or null_count == 0)

        return FKValidationResult(
            relationship_name=relationship.name,
            valid=is_valid,
            total_rows=total_rows,
            orphan_count=orphan_count,
            null_count=null_count,
            orphan_values=orphan_values,
        )

    def _validate_pandas(
        self,
        fact_df: Any,
        dim_df: Any,
        relationship: RelationshipConfig,
    ) -> FKValidationResult:
        """Validate using Pandas set membership against the dimension keys."""
        fk_col = relationship.fact_key
        dk_col = relationship.dimension_key

        total_rows = len(fact_df)
        null_count = int(fact_df[fk_col].isna().sum())

        # Distinct non-null dimension keys; membership is O(1) per fact row.
        dim_keys = set(dim_df[dk_col].dropna().unique())

        non_null_fks = fact_df[fk_col].dropna()
        orphan_mask = ~non_null_fks.isin(dim_keys)
        orphan_count = int(orphan_mask.sum())

        orphan_values = []
        if orphan_count > 0:
            # Sample capped at 100 distinct values, mirroring the Spark path.
            orphan_values = list(non_null_fks[orphan_mask].unique()[:100])

        is_valid = orphan_count == 0 and (relationship.nullable or null_count == 0)

        return FKValidationResult(
            relationship_name=relationship.name,
            valid=is_valid,
            total_rows=total_rows,
            orphan_count=orphan_count,
            null_count=null_count,
            orphan_values=orphan_values,
        )

    def validate_fact(
        self,
        fact_df: Any,
        fact_table: str,
        context: EngineContext,
    ) -> FKValidationReport:
        """
        Validate all FK relationships for a fact table.

        Args:
            fact_df: Fact DataFrame to validate
            fact_table: Fact table name
            context: EngineContext with dimension data

        Returns:
            FKValidationReport with all validation results
        """
        ctx = get_logging_context()
        start_time = time.time()

        ctx.info("Starting FK validation", fact_table=fact_table)

        relationships = self.registry.get_fact_relationships(fact_table)

        if not relationships:
            # No declared relationships: vacuously valid, but warn — this is
            # usually a configuration oversight.
            ctx.warning(
                "No FK relationships defined",
                fact_table=fact_table,
            )
            return FKValidationReport(
                fact_table=fact_table,
                all_valid=True,
                total_relationships=0,
                valid_relationships=0,
                elapsed_ms=(time.time() - start_time) * 1000,
            )

        results = []
        all_orphans = []

        for relationship in relationships:
            result = self.validate_relationship(fact_df, relationship, context)
            results.append(result)

            # Flatten the sampled orphan values into OrphanRecord entries.
            if result.orphan_count > 0:
                for orphan_val in result.orphan_values:
                    all_orphans.append(
                        OrphanRecord(
                            fact_key_value=orphan_val,
                            fact_key_column=relationship.fact_key,
                            dimension_table=relationship.dimension,
                        )
                    )

        all_valid = all(r.valid for r in results)
        valid_count = sum(1 for r in results if r.valid)
        elapsed_ms = (time.time() - start_time) * 1000

        if all_valid:
            ctx.info(
                "FK validation passed",
                fact_table=fact_table,
                relationships=len(relationships),
            )
        else:
            ctx.warning(
                "FK validation failed",
                fact_table=fact_table,
                valid=valid_count,
                total=len(relationships),
            )

        return FKValidationReport(
            fact_table=fact_table,
            all_valid=all_valid,
            total_relationships=len(relationships),
            valid_relationships=valid_count,
            results=results,
            orphan_records=all_orphans,
            elapsed_ms=elapsed_ms,
        )
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def get_orphan_records(
    fact_df: Any,
    relationship: RelationshipConfig,
    dim_df: Any,
    engine_type: EngineType,
) -> Any:
    """
    Extract orphan records from a fact table.

    Args:
        fact_df: Fact DataFrame
        relationship: Relationship configuration
        dim_df: Dimension DataFrame
        engine_type: Engine type (SPARK or PANDAS)

    Returns:
        DataFrame containing orphan records
    """
    fk = relationship.fact_key
    dk = relationship.dimension_key

    if engine_type != EngineType.SPARK:
        # Pandas: set membership against the distinct non-null dimension keys.
        known_keys = set(dim_df[dk].dropna().unique())
        present = fact_df[fk].notna()
        unmatched = ~fact_df[fk].isin(known_keys)
        return fact_df[unmatched & present].copy()

    # Spark: left-anti join keeps fact rows with no matching dimension key.
    from pyspark.sql import functions as F

    dim_keys = dim_df.select(F.col(dk).alias("_dim_key")).distinct()
    candidates = fact_df.filter(F.col(fk).isNotNull())
    return candidates.join(
        dim_keys,
        candidates[fk] == dim_keys["_dim_key"],
        "left_anti",
    )
|
462
|
+
|
|
463
|
+
|
|
464
|
+
def validate_fk_on_load(
    fact_df: Any,
    relationships: List[RelationshipConfig],
    context: EngineContext,
    on_failure: str = "error",
) -> Any:
    """
    Validate FK constraints and optionally filter orphans.

    This is a convenience function for use in FactPattern.

    Note: the per-relationship ``on_violation`` setting is not consulted
    here; the caller-supplied ``on_failure`` applies to every relationship.

    Args:
        fact_df: Fact DataFrame to validate
        relationships: List of relationship configs
        context: EngineContext with dimension data
        on_failure: Action on failure ("error", "warn", "filter")

    Returns:
        fact_df (possibly filtered if on_failure="filter")

    Raises:
        ValueError: If on_failure="error" and validation fails
    """
    ctx = get_logging_context()

    registry = RelationshipRegistry(relationships=relationships)
    validator = FKValidator(registry)

    for rel in relationships:
        result = validator.validate_relationship(fact_df, rel, context)

        if not result.valid:
            if on_failure == "error":
                raise ValueError(
                    f"FK validation failed for '{rel.name}': "
                    f"{result.orphan_count} orphans, {result.null_count} nulls. "
                    f"Sample orphan values: {result.orphan_values[:5]}"
                )
            elif on_failure == "warn":
                ctx.warning(
                    f"FK validation warning for '{rel.name}': "
                    f"{result.orphan_count} orphans, {result.null_count} nulls"
                )
            elif on_failure == "filter":
                try:
                    dim_df = context.get(rel.dimension)
                except KeyError:
                    # Dimension missing: nothing to filter against.
                    continue

                if context.engine_type == EngineType.SPARK:
                    from pyspark.sql import functions as F

                    dim_keys = dim_df.select(
                        F.col(rel.dimension_key).alias("_fk_key")
                    ).distinct()
                    # left_semi keeps only fact columns, so no drop() needed.
                    kept = fact_df.join(
                        dim_keys,
                        fact_df[rel.fact_key] == dim_keys["_fk_key"],
                        "left_semi",
                    )
                    if rel.nullable:
                        # BUGFIX: nulls are valid for nullable FKs (matching
                        # validate_relationship), but an equality join silently
                        # drops them — add the null-FK rows back.
                        kept = kept.unionByName(
                            fact_df.filter(F.col(rel.fact_key).isNull())
                        )
                    fact_df = kept
                else:
                    dim_keys = set(dim_df[rel.dimension_key].dropna().unique())
                    keep_mask = fact_df[rel.fact_key].isin(dim_keys)
                    if rel.nullable:
                        # BUGFIX: keep null FK rows when nulls are allowed.
                        keep_mask |= fact_df[rel.fact_key].isna()
                    fact_df = fact_df[keep_mask].copy()

                ctx.info(
                    f"Filtered orphans for '{rel.name}'",
                    remaining_rows=len(fact_df) if hasattr(fact_df, "__len__") else "N/A",
                )

    return fact_df
|
532
|
+
|
|
533
|
+
|
|
534
|
+
def parse_relationships_config(config_dict: Dict[str, Any]) -> RelationshipRegistry:
    """
    Parse relationships from a configuration dictionary.

    Args:
        config_dict: Config dict with "relationships" key

    Returns:
        RelationshipRegistry instance
    """
    entries = config_dict.get("relationships", [])
    configs = [RelationshipConfig(**entry) for entry in entries]
    return RelationshipRegistry(relationships=configs)