odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,155 @@
1
+ """
2
+ Explanation Quality Linter
3
+ ===========================
4
+
5
+ Validates that explanations meet Odibi quality standards.
6
+ """
7
+
8
+ import re
9
+ from dataclasses import dataclass
10
+ from typing import List
11
+
12
+
13
@dataclass
class LintIssue:
    """A linting issue found in an explanation.

    Attributes:
        severity: One of "error", "warning", "info".
        message: Human-readable description of the problem.
        rule: Rule identifier (e.g. "E001", "W001").
    """

    severity: str  # "error", "warning", "info"
    message: str
    rule: str

    def __str__(self):
        # .get() with a neutral fallback: an unknown severity must not
        # raise KeyError while formatting a lint report.
        symbol = {"error": "❌", "warning": "⚠️", "info": "ℹ️"}.get(self.severity, "•")
        return f"{symbol} {self.message} [{self.rule}]"
24
+
25
+
26
class ExplanationLinter:
    """
    Lints explanation text for quality issues.

    Rules applied:
        E001 - explanation is empty
        E002 - shorter than MIN_LENGTH characters
        E003 - missing a required section (Purpose, Details, Result)
        E004 - contains a generic/lazy phrase or placeholder
        W001 - mentions a formula without a fenced code block
    """

    REQUIRED_SECTIONS = ["Purpose", "Details", "Result"]

    LAZY_PHRASES = [
        "calculates stuff",
        "does things",
        "processes data",
        "handles records",
        "TODO",
        "[placeholder]",
        "TBD",
        "to be determined",
    ]

    MIN_LENGTH = 50  # characters

    def __init__(self):
        # Issues found by the most recent call to lint().
        self.issues: List[LintIssue] = []

    def lint(self, explanation: str, operation_name: str = "unknown") -> List[LintIssue]:
        """
        Run every check against an explanation and return the issues found.

        Args:
            explanation: The explanation text
            operation_name: Name of the operation (for error messages)

        Returns:
            List of LintIssue objects (also kept on ``self.issues``)
        """
        self.issues = []

        # An empty explanation short-circuits: no other rule is meaningful.
        if not (explanation and explanation.strip()):
            self._report(
                "error",
                f"Explanation for '{operation_name}' is empty",
                "E001",
            )
            return self.issues

        self._check_length(explanation, operation_name)
        self._check_required_sections(explanation, operation_name)
        self._check_lazy_phrases(explanation, operation_name)
        self._check_formula_formatting(explanation, operation_name)

        return self.issues

    def _report(self, severity: str, message: str, rule: str) -> None:
        """Append a single issue to the running list."""
        self.issues.append(LintIssue(severity=severity, message=message, rule=rule))

    def _check_length(self, text: str, op_name: str):
        """Flag explanations shorter than MIN_LENGTH characters (E002)."""
        stripped_len = len(text.strip())
        if stripped_len < self.MIN_LENGTH:
            self._report(
                "error",
                f"Explanation for '{op_name}' too short ({stripped_len} chars, minimum {self.MIN_LENGTH})",
                "E002",
            )

    def _check_required_sections(self, text: str, op_name: str):
        """Flag any missing **Section** / **Section:** heading (E003)."""
        for section in self.REQUIRED_SECTIONS:
            if re.search(rf"\*\*{section}:?\*\*", text, re.IGNORECASE):
                continue
            self._report(
                "error",
                f"Explanation for '{op_name}' missing required section: {section}",
                "E003",
            )

    def _check_lazy_phrases(self, text: str, op_name: str):
        """Flag generic phrases and placeholders (E004); match is case-insensitive."""
        lowered = text.lower()
        for phrase in self.LAZY_PHRASES:
            if phrase.lower() in lowered:
                self._report(
                    "error",
                    f"Explanation for '{op_name}' contains generic phrase: '{phrase}'",
                    "E004",
                )

    def _check_formula_formatting(self, text: str, op_name: str):
        """Warn when 'formula' is mentioned without a fenced code block (W001)."""
        if "formula" in text.lower() and "```" not in text:
            self._report(
                "warning",
                f"Explanation for '{op_name}' mentions formula but no code block found",
                "W001",
            )

    def has_errors(self) -> bool:
        """Return True if the last lint() run produced any error-severity issues."""
        return any(item.severity == "error" for item in self.issues)

    def format_issues(self) -> str:
        """Render all issues as a newline-joined string, or a success marker."""
        if not self.issues:
            return "✅ No issues found"
        return "\n".join(str(item) for item in self.issues)
odibi/validation/fk.py ADDED
@@ -0,0 +1,547 @@
1
+ """
2
+ Foreign Key Validation Module
3
+ =============================
4
+
5
+ Declare and validate referential integrity between fact and dimension tables.
6
+
7
+ Features:
8
+ - Declare relationships in YAML
9
+ - Validate referential integrity on fact load
10
+ - Detect orphan records
11
+ - Generate lineage from relationships
12
+ - Integration with FactPattern
13
+
14
+ Example Config:
15
+ relationships:
16
+ - name: orders_to_customers
17
+ fact: fact_orders
18
+ dimension: dim_customer
19
+ fact_key: customer_sk
20
+ dimension_key: customer_sk
21
+
22
+ - name: orders_to_products
23
+ fact: fact_orders
24
+ dimension: dim_product
25
+ fact_key: product_sk
26
+ dimension_key: product_sk
27
+ """
28
+
29
+ import time
30
+ from dataclasses import dataclass, field
31
+ from typing import Any, Dict, List, Optional
32
+
33
+ from pydantic import BaseModel, Field, field_validator
34
+
35
+ from odibi.context import EngineContext
36
+ from odibi.enums import EngineType
37
+ from odibi.utils.logging_context import get_logging_context
38
+
39
+
40
class RelationshipConfig(BaseModel):
    """
    Configuration for a foreign key relationship.

    Attributes:
        name: Unique relationship identifier
        fact: Fact table name
        dimension: Dimension table name
        fact_key: Foreign key column in fact table
        dimension_key: Primary/surrogate key column in dimension
        nullable: Whether nulls are allowed in fact_key
        on_violation: Action on violation ("warn", "error", "quarantine")
    """

    name: str = Field(..., description="Unique relationship identifier")
    fact: str = Field(..., description="Fact table name")
    dimension: str = Field(..., description="Dimension table name")
    fact_key: str = Field(..., description="FK column in fact table")
    dimension_key: str = Field(..., description="PK/SK column in dimension")
    nullable: bool = Field(default=False, description="Allow nulls in fact_key")
    on_violation: str = Field(default="error", description="Action on violation")

    @field_validator("name", "fact", "dimension", "fact_key", "dimension_key")
    @classmethod
    def validate_not_empty(cls, v: str, info) -> str:
        # Reject empty/whitespace-only identifiers; the stripped value is
        # stored, so fields never carry surrounding whitespace.
        if not v or not v.strip():
            raise ValueError(
                f"RelationshipConfig.{info.field_name} cannot be empty. "
                f"Got: {v!r}. Provide a non-empty string value."
            )
        return v.strip()

    @field_validator("on_violation")
    @classmethod
    def validate_on_violation(cls, v: str) -> str:
        # Normalized to lowercase so downstream comparisons are case-stable.
        valid = ("warn", "error", "quarantine")
        if v.lower() not in valid:
            raise ValueError(f"Invalid on_violation value. Expected one of {valid}, got: {v!r}.")
        return v.lower()
79
+
80
+
81
class RelationshipRegistry(BaseModel):
    """
    Registry of all declared relationships.

    Attributes:
        relationships: List of relationship configurations
    """

    relationships: List[RelationshipConfig] = Field(
        default_factory=list, description="Relationship definitions"
    )

    def get_relationship(self, name: str) -> Optional[RelationshipConfig]:
        """Return the relationship with this name (case-insensitive), or None."""
        wanted = name.lower()
        return next(
            (rel for rel in self.relationships if rel.name.lower() == wanted),
            None,
        )

    def get_fact_relationships(self, fact_table: str) -> List[RelationshipConfig]:
        """Return all relationships whose fact table matches (case-insensitive)."""
        wanted = fact_table.lower()
        return [rel for rel in self.relationships if rel.fact.lower() == wanted]

    def get_dimension_relationships(self, dim_table: str) -> List[RelationshipConfig]:
        """Return all relationships whose dimension matches (case-insensitive)."""
        wanted = dim_table.lower()
        return [rel for rel in self.relationships if rel.dimension.lower() == wanted]

    def generate_lineage(self) -> Dict[str, List[str]]:
        """
        Generate lineage map from relationships.

        Returns:
            Dict mapping each fact table to its ordered, de-duplicated
            list of dimension dependencies.
        """
        lineage: Dict[str, List[str]] = {}
        for rel in self.relationships:
            dims = lineage.setdefault(rel.fact, [])
            if rel.dimension not in dims:
                dims.append(rel.dimension)
        return lineage
122
+
123
+
124
@dataclass
class OrphanRecord:
    """Details of an orphan record (a fact FK value with no dimension match)."""

    fact_key_value: Any  # FK value present in the fact but absent from the dimension
    fact_key_column: str  # fact-table column the value came from
    dimension_table: str  # dimension that should have contained the key
    row_index: Optional[int] = None  # positional row index in the fact, when known
132
+
133
+
134
@dataclass
class FKValidationResult:
    """Result of FK validation for a single relationship."""

    relationship_name: str  # RelationshipConfig.name this result belongs to
    valid: bool  # True when no orphans and nulls are allowed or absent
    total_rows: int  # rows examined in the fact table
    orphan_count: int  # non-null FK values missing from the dimension
    null_count: int  # null FK values in the fact table
    orphan_values: List[Any] = field(default_factory=list)  # sample of orphan key values
    elapsed_ms: float = 0.0  # wall-clock validation time in milliseconds
    error: Optional[str] = None  # set when validation itself failed (e.g. missing dimension)
146
+
147
+
148
@dataclass
class FKValidationReport:
    """Complete FK validation report for a fact table (all its relationships)."""

    fact_table: str  # fact table the report covers
    all_valid: bool  # True when every relationship validated cleanly
    total_relationships: int  # number of relationships checked
    valid_relationships: int  # number that passed
    results: List[FKValidationResult] = field(default_factory=list)  # per-relationship detail
    orphan_records: List[OrphanRecord] = field(default_factory=list)  # sampled orphans across relationships
    elapsed_ms: float = 0.0  # total wall-clock validation time in milliseconds
159
+
160
+
161
class FKValidator:
    """
    Validate foreign key relationships between fact and dimension tables.

    Usage:
        registry = RelationshipRegistry(relationships=[...])
        validator = FKValidator(registry)
        report = validator.validate_fact(fact_df, "fact_orders", context)
    """

    # Cap on how many distinct orphan key values are sampled into a result.
    # Shared by the Spark and pandas paths so diagnostics are consistent.
    MAX_ORPHAN_SAMPLES = 100

    def __init__(self, registry: RelationshipRegistry):
        """
        Initialize with relationship registry.

        Args:
            registry: RelationshipRegistry with relationship definitions
        """
        self.registry = registry

    def validate_relationship(
        self,
        fact_df: Any,
        relationship: RelationshipConfig,
        context: EngineContext,
    ) -> FKValidationResult:
        """
        Validate a single FK relationship.

        Args:
            fact_df: Fact DataFrame to validate
            relationship: Relationship configuration
            context: EngineContext with dimension data

        Returns:
            FKValidationResult with validation details. Never raises:
            engine failures are captured in the result's ``error`` field.
        """
        ctx = get_logging_context()
        start_time = time.time()

        ctx.debug(
            "Validating FK relationship",
            relationship=relationship.name,
            fact=relationship.fact,
            dimension=relationship.dimension,
        )

        try:
            dim_df = context.get(relationship.dimension)
        except KeyError:
            # A missing dimension is reported as an invalid result rather
            # than raised, so validate_fact() can continue with the rest.
            elapsed_ms = (time.time() - start_time) * 1000
            return FKValidationResult(
                relationship_name=relationship.name,
                valid=False,
                total_rows=0,
                orphan_count=0,
                null_count=0,
                elapsed_ms=elapsed_ms,
                error=f"Dimension table '{relationship.dimension}' not found",
            )

        try:
            if context.engine_type == EngineType.SPARK:
                result = self._validate_spark(fact_df, dim_df, relationship)
            else:
                result = self._validate_pandas(fact_df, dim_df, relationship)

            result.elapsed_ms = (time.time() - start_time) * 1000

            if result.valid:
                ctx.debug(
                    "FK validation passed",
                    relationship=relationship.name,
                    total_rows=result.total_rows,
                )
            else:
                ctx.warning(
                    "FK validation failed",
                    relationship=relationship.name,
                    orphan_count=result.orphan_count,
                    null_count=result.null_count,
                )

            return result

        except Exception as e:
            elapsed_ms = (time.time() - start_time) * 1000
            ctx.error(
                f"FK validation error: {e}",
                relationship=relationship.name,
            )
            return FKValidationResult(
                relationship_name=relationship.name,
                valid=False,
                total_rows=0,
                orphan_count=0,
                null_count=0,
                elapsed_ms=elapsed_ms,
                error=str(e),
            )

    def _validate_spark(
        self,
        fact_df: Any,
        dim_df: Any,
        relationship: RelationshipConfig,
    ) -> FKValidationResult:
        """Validate using Spark: anti-join fact rows against distinct dimension keys."""
        from pyspark.sql import functions as F

        fk_col = relationship.fact_key
        dk_col = relationship.dimension_key

        total_rows = fact_df.count()

        null_count = fact_df.filter(F.col(fk_col).isNull()).count()

        dim_keys = dim_df.select(F.col(dk_col).alias("_dim_key")).distinct()

        non_null_facts = fact_df.filter(F.col(fk_col).isNotNull())
        # left_anti keeps only fact rows with no matching dimension key.
        orphans = non_null_facts.join(
            dim_keys,
            non_null_facts[fk_col] == dim_keys["_dim_key"],
            "left_anti",
        )

        orphan_count = orphans.count()

        orphan_values = []
        if orphan_count > 0:
            # BUGFIX: previously no samples were collected at all when
            # orphan_count exceeded 100, leaving diagnostics empty exactly
            # when they matter most. Always sample up to the cap, matching
            # the pandas path.
            orphan_values = [
                row[fk_col]
                for row in orphans.select(fk_col)
                .distinct()
                .limit(self.MAX_ORPHAN_SAMPLES)
                .collect()
            ]

        # Nulls are tolerated only when the relationship is declared nullable.
        is_valid = orphan_count == 0 and (relationship.nullable or null_count == 0)

        return FKValidationResult(
            relationship_name=relationship.name,
            valid=is_valid,
            total_rows=total_rows,
            orphan_count=orphan_count,
            null_count=null_count,
            orphan_values=orphan_values,
        )

    def _validate_pandas(
        self,
        fact_df: Any,
        dim_df: Any,
        relationship: RelationshipConfig,
    ) -> FKValidationResult:
        """Validate using Pandas: set membership of fact FKs in dimension keys."""

        fk_col = relationship.fact_key
        dk_col = relationship.dimension_key

        total_rows = len(fact_df)

        null_count = int(fact_df[fk_col].isna().sum())

        dim_keys = set(dim_df[dk_col].dropna().unique())

        non_null_fks = fact_df[fk_col].dropna()
        orphan_mask = ~non_null_fks.isin(dim_keys)
        orphan_count = int(orphan_mask.sum())

        orphan_values = []
        if orphan_count > 0:
            orphan_values = list(
                non_null_fks[orphan_mask].unique()[: self.MAX_ORPHAN_SAMPLES]
            )

        # Nulls are tolerated only when the relationship is declared nullable.
        is_valid = orphan_count == 0 and (relationship.nullable or null_count == 0)

        return FKValidationResult(
            relationship_name=relationship.name,
            valid=is_valid,
            total_rows=total_rows,
            orphan_count=orphan_count,
            null_count=null_count,
            orphan_values=orphan_values,
        )

    def validate_fact(
        self,
        fact_df: Any,
        fact_table: str,
        context: EngineContext,
    ) -> FKValidationReport:
        """
        Validate all FK relationships for a fact table.

        Args:
            fact_df: Fact DataFrame to validate
            fact_table: Fact table name
            context: EngineContext with dimension data

        Returns:
            FKValidationReport with all validation results. A fact table
            with no declared relationships yields an all_valid=True report.
        """
        ctx = get_logging_context()
        start_time = time.time()

        ctx.info("Starting FK validation", fact_table=fact_table)

        relationships = self.registry.get_fact_relationships(fact_table)

        if not relationships:
            ctx.warning(
                "No FK relationships defined",
                fact_table=fact_table,
            )
            return FKValidationReport(
                fact_table=fact_table,
                all_valid=True,
                total_relationships=0,
                valid_relationships=0,
                elapsed_ms=(time.time() - start_time) * 1000,
            )

        results = []
        all_orphans = []

        for relationship in relationships:
            result = self.validate_relationship(fact_df, relationship, context)
            results.append(result)

            # Collect sampled orphan values as OrphanRecord entries for the report.
            if result.orphan_count > 0:
                for orphan_val in result.orphan_values:
                    all_orphans.append(
                        OrphanRecord(
                            fact_key_value=orphan_val,
                            fact_key_column=relationship.fact_key,
                            dimension_table=relationship.dimension,
                        )
                    )

        all_valid = all(r.valid for r in results)
        valid_count = sum(1 for r in results if r.valid)
        elapsed_ms = (time.time() - start_time) * 1000

        if all_valid:
            ctx.info(
                "FK validation passed",
                fact_table=fact_table,
                relationships=len(relationships),
            )
        else:
            ctx.warning(
                "FK validation failed",
                fact_table=fact_table,
                valid=valid_count,
                total=len(relationships),
            )

        return FKValidationReport(
            fact_table=fact_table,
            all_valid=all_valid,
            total_relationships=len(relationships),
            valid_relationships=valid_count,
            results=results,
            orphan_records=all_orphans,
            elapsed_ms=elapsed_ms,
        )
423
+
424
+
425
def get_orphan_records(
    fact_df: Any,
    relationship: RelationshipConfig,
    dim_df: Any,
    engine_type: EngineType,
) -> Any:
    """
    Extract orphan records from a fact table.

    An orphan is a fact row whose (non-null) foreign key value has no
    matching key in the dimension table.

    Args:
        fact_df: Fact DataFrame
        relationship: Relationship configuration
        dim_df: Dimension DataFrame
        engine_type: Engine type (SPARK or PANDAS)

    Returns:
        DataFrame containing orphan records
    """
    fk_col = relationship.fact_key
    dk_col = relationship.dimension_key

    if engine_type == EngineType.SPARK:
        from pyspark.sql import functions as F

        dim_keys = dim_df.select(F.col(dk_col).alias("_dim_key")).distinct()
        non_null_facts = fact_df.filter(F.col(fk_col).isNotNull())
        # left_anti keeps only fact rows without a matching dimension key.
        return non_null_facts.join(
            dim_keys,
            non_null_facts[fk_col] == dim_keys["_dim_key"],
            "left_anti",
        )

    # Pandas path: membership test against the set of dimension keys;
    # null FKs are excluded from the orphan set by definition.
    dim_keys = set(dim_df[dk_col].dropna().unique())
    orphan_mask = fact_df[fk_col].notna() & ~fact_df[fk_col].isin(dim_keys)
    return fact_df[orphan_mask].copy()
462
+
463
+
464
def validate_fk_on_load(
    fact_df: Any,
    relationships: List[RelationshipConfig],
    context: EngineContext,
    on_failure: str = "error",
) -> Any:
    """
    Validate FK constraints and optionally filter orphans.

    This is a convenience function for use in FactPattern.

    Args:
        fact_df: Fact DataFrame to validate
        relationships: List of relationship configs
        context: EngineContext with dimension data
        on_failure: Action on failure ("error", "warn", "filter")

    Returns:
        fact_df (possibly filtered if on_failure="filter"; nullable
        relationships keep their null-FK rows when filtering)

    Raises:
        ValueError: If on_failure="error" and validation fails
    """
    ctx = get_logging_context()

    registry = RelationshipRegistry(relationships=relationships)
    validator = FKValidator(registry)

    for rel in relationships:
        result = validator.validate_relationship(fact_df, rel, context)

        if result.valid:
            continue

        if on_failure == "error":
            raise ValueError(
                f"FK validation failed for '{rel.name}': "
                f"{result.orphan_count} orphans, {result.null_count} nulls. "
                f"Sample orphan values: {result.orphan_values[:5]}"
            )
        elif on_failure == "warn":
            ctx.warning(
                f"FK validation warning for '{rel.name}': "
                f"{result.orphan_count} orphans, {result.null_count} nulls"
            )
        elif on_failure == "filter":
            try:
                dim_df = context.get(rel.dimension)
            except KeyError:
                # Dimension missing entirely: nothing to filter against.
                continue

            if context.engine_type == EngineType.SPARK:
                from pyspark.sql import functions as F

                dim_keys = dim_df.select(
                    F.col(rel.dimension_key).alias("_fk_key")
                ).distinct()
                filtered = fact_df.join(
                    dim_keys,
                    fact_df[rel.fact_key] == dim_keys["_fk_key"],
                    "inner",
                ).drop("_fk_key")
                if rel.nullable:
                    # BUGFIX: an inner join drops null-FK rows, but a
                    # nullable relationship permits them — add them back.
                    filtered = filtered.unionByName(
                        fact_df.filter(F.col(rel.fact_key).isNull())
                    )
                fact_df = filtered
            else:
                dim_keys = set(dim_df[rel.dimension_key].dropna().unique())
                keep_mask = fact_df[rel.fact_key].isin(dim_keys)
                if rel.nullable:
                    # BUGFIX: null is never in dim_keys, so a plain isin()
                    # would silently drop rows a nullable FK allows.
                    keep_mask |= fact_df[rel.fact_key].isna()
                fact_df = fact_df[keep_mask].copy()

            ctx.info(
                f"Filtered orphans for '{rel.name}'",
                remaining_rows=len(fact_df) if hasattr(fact_df, "__len__") else "N/A",
            )

    return fact_df
532
+
533
+
534
def parse_relationships_config(config_dict: Dict[str, Any]) -> RelationshipRegistry:
    """
    Parse relationships from a configuration dictionary.

    Args:
        config_dict: Config dict with "relationships" key (missing or
            empty key yields an empty registry)

    Returns:
        RelationshipRegistry instance
    """
    raw_entries = config_dict.get("relationships", [])
    parsed = [RelationshipConfig(**entry) for entry in raw_entries]
    return RelationshipRegistry(relationships=parsed)