odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/diagnostics/delta.py
@@ -0,0 +1,520 @@
+ """
+ Delta Lake Diagnostics
+ ======================
+
+ Tools for analyzing Delta Lake tables, history, and drift.
+ """
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, List, Optional
+
+
+ @dataclass
+ class DeltaDiffResult:
+     """Result of comparing two Delta table versions."""
+
+     table_path: str
+     version_a: int
+     version_b: int
+
+     # Metadata changes
+     rows_change: int
+     files_change: int
+     size_change_bytes: int
+
+     # Schema changes
+     schema_added: List[str]
+     schema_removed: List[str]
+
+     schema_current: Optional[List[str]] = None
+     schema_previous: Optional[List[str]] = None
+
+     rows_added: Optional[int] = None
+     rows_removed: Optional[int] = None
+     rows_updated: Optional[int] = None
+
+     # Operation info
+     operations: Optional[List[str]] = None  # Operations that happened between the two versions
+
+     # Data diff samples (optional)
+     sample_added: Optional[List[Dict[str, Any]]] = None
+     sample_removed: Optional[List[Dict[str, Any]]] = None
+     sample_updated: Optional[List[Dict[str, Any]]] = None
+
+
+ def get_delta_diff(
+     table_path: str,
+     version_a: int,
+     version_b: int,
+     spark: Optional[Any] = None,
+     deep: bool = False,
+     keys: Optional[List[str]] = None,
+ ) -> DeltaDiffResult:
+     """
+     Compare two versions of a Delta table.
+
+     Args:
+         table_path: Path to Delta table
+         version_a: Start version
+         version_b: End version
+         spark: Optional SparkSession. If None, uses deltalake (Pandas).
+         deep: If True, perform expensive row-by-row comparison (exceptAll).
+             If False, rely on metadata and stats.
+         keys: List of primary key columns for detecting updates.
+
+     Returns:
+         DeltaDiffResult object
+     """
+     if spark:
+         return _get_delta_diff_spark(spark, table_path, version_a, version_b, deep, keys)
+     else:
+         return _get_delta_diff_pandas(table_path, version_a, version_b, deep, keys)
+
+
+ def _get_delta_diff_spark(
+     spark: Any,
+     table_path: str,
+     version_a: int,
+     version_b: int,
+     deep: bool = False,
+     keys: Optional[List[str]] = None,
+ ) -> DeltaDiffResult:
+     """Spark implementation of delta diff."""
+     try:
+         from delta.tables import DeltaTable
+     except ImportError:
+         raise ImportError("Delta Lake support requires 'delta-spark'")
+
+     dt = DeltaTable.forPath(spark, table_path)
+     history = dt.history().collect()
+
+     # Keep only the commits that happened AFTER version_a, up to and including version_b.
+     # History is usually in reverse order, so filter rather than slice.
+     relevant_commits = [
+         row
+         for row in history
+         if min(version_a, version_b) < row["version"] <= max(version_a, version_b)
+     ]
+
+     operations = [row["operation"] for row in relevant_commits]
+
+     # Estimate row/file/byte changes from operation metrics where available
+     rows_change = 0
+     files_change = 0
+     bytes_change = 0
+
+     for commit in relevant_commits:
+         # Row has no .get(); convert the commit to a dict first
+         metrics = commit.asDict().get("operationMetrics") or {}
+
+         # Heuristic based on operation type, but usually: inserted - deleted
+         inserted = int(metrics.get("numTargetRowsInserted", 0) or metrics.get("numOutputRows", 0))
+         deleted = int(metrics.get("numTargetRowsDeleted", 0))
+
+         # Direction matters: going a -> b with b > a sums forward, otherwise it reverts
+         factor = 1 if version_b > version_a else -1
+
+         rows_change += (inserted - deleted) * factor
+
+         # Files
+         files_added = int(metrics.get("numFilesAdded", 0) or metrics.get("numAddedFiles", 0))
+         files_removed = int(metrics.get("numFilesRemoved", 0) or metrics.get("numRemovedFiles", 0))
+         files_change += (files_added - files_removed) * factor
+
+         # Bytes
+         bytes_added = int(metrics.get("numBytesAdded", 0) or metrics.get("numAddedBytes", 0))
+         bytes_removed = int(metrics.get("numBytesRemoved", 0) or metrics.get("numRemovedBytes", 0))
+         bytes_change += (bytes_added - bytes_removed) * factor
+
+     # Load both snapshots. Spark is lazy, so defining the DataFrames is cheap,
+     # and the schema comes from the snapshot (history does not carry it).
+     df_a = spark.read.format("delta").option("versionAsOf", version_a).load(table_path)
+     df_b = spark.read.format("delta").option("versionAsOf", version_b).load(table_path)
+
+     schema_a = set(df_a.columns)
+     schema_b = set(df_b.columns)
+
+     # Deep diff logic
+     added_rows = None
+     removed_rows = None
+     updated_rows = None
+     rows_added_count = None
+     rows_removed_count = None
+     rows_updated_count = None
+
+     if deep:
+         # Actual row counts (authoritative, overriding the metrics heuristic)
+         rows_a = df_a.count()
+         rows_b = df_b.count()
+         rows_change = rows_b - rows_a
+
+         common_cols = list(schema_a.intersection(schema_b))
+         if common_cols:
+             df_a_common = df_a.select(*common_cols)
+             df_b_common = df_b.select(*common_cols)
+
+             if keys and set(keys).issubset(common_cols):
+                 # --- Spark key-based diff ---
+                 # Join on keys to find added, removed, and updated rows.
+
+                 # 1. Added: in B but not in A (based on keys)
+                 diff_added = df_b_common.join(df_a_common, keys, "left_anti")
+                 rows_added_count = diff_added.count()
+                 added_rows = [row.asDict() for row in diff_added.limit(10).collect()]
+
+                 # 2. Removed: in A but not in B (based on keys)
+                 diff_removed = df_a_common.join(df_b_common, keys, "left_anti")
+                 rows_removed_count = diff_removed.count()
+                 removed_rows = [row.asDict() for row in diff_removed.limit(10).collect()]
+
+                 # 3. Updated: present in both (inner join), but a value column differs
+                 value_cols = [c for c in common_cols if c not in keys]
+
+                 # Alias the DataFrames to avoid column ambiguity
+                 df_a_aliased = df_a_common.alias("a")
+                 df_b_aliased = df_b_common.alias("b")
+
+                 from pyspark.sql import functions as F
+
+                 # Build the "any value column changed" condition, starting from False
+                 change_condition = F.lit(False)
+
+                 for col in value_cols:
+                     # A column changed if the null-safe equality fails: NOT (a <=> b)
+                     col_changed = ~F.col(f"a.{col}").eqNullSafe(F.col(f"b.{col}"))
+                     change_condition = change_condition | col_changed
+
+                 # Inner join on key equality, then filter to rows where a value changed
+                 join_cond = [F.col(f"a.{k}") == F.col(f"b.{k}") for k in keys]
+
+                 diff_updated = (
+                     df_b_aliased.join(df_a_aliased, join_cond, "inner")
+                     .filter(change_condition)
+                     .select("b.*")  # Return the 'new' state
+                 )
+
+                 rows_updated_count = diff_updated.count()
+
+                 # Sample the first 10 updated rows (new state)
+                 updated_rows = [row.asDict() for row in diff_updated.limit(10).collect()]
+
+             else:
+                 # Fall back to a set diff when no usable keys are provided
+                 diff_added = df_b_common.exceptAll(df_a_common)
+                 diff_removed = df_a_common.exceptAll(df_b_common)
+
+                 rows_added_count = diff_added.count()
+                 rows_removed_count = diff_removed.count()
+
+                 added_rows = [row.asDict() for row in diff_added.limit(10).collect()]
+                 removed_rows = [row.asDict() for row in diff_removed.limit(10).collect()]
+
+     return DeltaDiffResult(
+         table_path=table_path,
+         version_a=version_a,
+         version_b=version_b,
+         rows_change=rows_change,
+         files_change=files_change,
+         size_change_bytes=bytes_change,
+         schema_added=list(schema_b - schema_a),
+         schema_removed=list(schema_a - schema_b),
+         schema_current=sorted(list(schema_b)),
+         schema_previous=sorted(list(schema_a)),
+         rows_added=rows_added_count,
+         rows_removed=rows_removed_count,
+         rows_updated=rows_updated_count,
+         sample_added=added_rows,
+         sample_removed=removed_rows,
+         sample_updated=updated_rows,
+         operations=operations,
+     )
+
+
+ def _get_delta_diff_pandas(
+     table_path: str,
+     version_a: int,
+     version_b: int,
+     deep: bool = False,
+     keys: Optional[List[str]] = None,
+ ) -> DeltaDiffResult:
+     """Pandas (deltalake) implementation of delta diff."""
+     try:
+         import pandas as pd
+         from deltalake import DeltaTable
+     except ImportError:
+         raise ImportError("Delta Lake support requires 'deltalake' and 'pandas'")
+
+     dt = DeltaTable(table_path)
+
+     # History
+     history = dt.history()
+     relevant_commits = [
+         h for h in history if min(version_a, version_b) < h["version"] <= max(version_a, version_b)
+     ]
+     operations = [h["operation"] for h in relevant_commits]
+
+     # Per-commit operation metrics are not readily exposed by the deltalake history
+     # wrapper, so row counts are taken from the loaded snapshots instead. This path
+     # assumes the table is small enough for single-node pandas execution.
+
+     # Snapshot A
+     dt.load_as_version(version_a)
+     # Get the schema without loading data; handle the API change in deltalake 0.15+
+     schema_obj = dt.schema()
+     if hasattr(schema_obj, "to_pyarrow"):
+         arrow_schema_a = schema_obj.to_pyarrow()
+     else:
+         arrow_schema_a = schema_obj.to_arrow()
+
+     schema_a = set(arrow_schema_a.names)
+
+     df_a = dt.to_pandas()
+     rows_a = len(df_a)
+
+     # Snapshot B
+     dt.load_as_version(version_b)
+     df_b = dt.to_pandas()
+
+     rows_b = len(df_b)
+     schema_b = set(df_b.columns)
+
+     rows_change = rows_b - rows_a
+
+     added_rows = None
+     removed_rows = None
+     updated_rows = None
+     rows_added_count = None
+     rows_removed_count = None
+     rows_updated_count = None
+
+     if deep:
+         # Compute the data diff. Pandas has no exceptAll, so use merge with an indicator.
+         common_cols = list(schema_a.intersection(schema_b))
+
+         if common_cols:
+             # Do NOT restrict the inputs to common_cols yet, or new/old columns are lost for the samples.
+
+             if keys and set(keys).issubset(common_cols):
+                 # --- Key-based diff (updates supported) ---
+                 # Outer merge on the KEYS only
+                 merged = df_b.merge(
+                     df_a, on=keys, how="outer", suffixes=("", "_old"), indicator=True
+                 )
+
+                 # Added: key in B only
+                 added_df = merged[merged["_merge"] == "left_only"]
+
+                 # Removed: key in A only
+                 removed_df = merged[merged["_merge"] == "right_only"]
+
+                 # Potential updates: key in both
+                 both_df = merged[merged["_merge"] == "both"]
+
+                 # For "both", check whether any common non-key column changed
+                 value_cols = [c for c in common_cols if c not in keys]
+
+                 updated_records = []
+
+                 for _, row in both_df.iterrows():
+                     changes = {}
+                     has_change = False
+                     for col in value_cols:
+                         new_val = row[col]
+                         old_val = row[f"{col}_old"]
+
+                         # Treat null/NaN on both sides as equal
+                         if pd.isna(new_val) and pd.isna(old_val):
+                             continue
+                         if new_val != old_val:
+                             changes[col] = {"old": old_val, "new": new_val}
+                             has_change = True
+
+                     if has_change:
+                         # Build a record containing the keys plus the changed values
+                         rec = {k: row[k] for k in keys}
+                         rec["_changes"] = changes
+                         updated_records.append(rec)
+
+                 rows_added_count = len(added_df)
+                 rows_removed_count = len(removed_df)
+                 rows_updated_count = len(updated_records)
+
+                 # Format the added sample as plain dicts: for added rows we want every column
+                 # of B. added_df comes from df_b; the "_old" columns are all NaN and are dropped.
+                 cols_b = list(schema_b)
+                 added_rows = added_df[cols_b].head(10).to_dict("records")
+
+                 # For removed rows we want every column of A. After the merge:
+                 #   - key columns are shared and keep their names,
+                 #   - common non-key columns collide, so the A side gets the "_old" suffix,
+                 #   - columns unique to A (dropped columns) do not collide and keep their names.
+                 removed_clean = []
+                 for _, row in removed_df.head(10).iterrows():
+                     rec = {}
+                     for col in schema_a:
+                         if col in keys:
+                             rec[col] = row[col]
+                         elif col in common_cols:
+                             # Collided column: for right_only rows the A value carries the "_old" suffix
+                             rec[col] = row[f"{col}_old"]
+                         else:
+                             # Unique to A (deleted column), no collision
+                             if col in row:
+                                 rec[col] = row[col]
+                     removed_clean.append(rec)
+                 removed_rows = removed_clean
+
+                 updated_rows = updated_records[:10]
+
+             else:
+                 # --- Set-based diff (no keys) ---
+                 # Merge on all common columns; only adds/removes can be detected, not updates.
+                 merged = df_b.merge(df_a, on=common_cols, how="outer", indicator=True)
+
+                 # Rows only in B (added) -> left_only
+                 added_df = merged[merged["_merge"] == "left_only"]
+
+                 # Rows only in A (removed) -> right_only
+                 removed_df = merged[merged["_merge"] == "right_only"]
+
+                 rows_added_count = len(added_df)
+                 rows_removed_count = len(removed_df)
+
+                 # For added rows, show columns from B: columns unique to B do not collide,
+                 # and the common columns are the join keys, so all of schema_b is present.
+                 cols_b = list(schema_b)
+                 added_rows = added_df[cols_b].head(10).to_dict("records")
+
+                 # For removed rows, show columns from A
+                 cols_a = list(schema_a)
+                 removed_rows = removed_df[cols_a].head(10).to_dict("records")
+
+     return DeltaDiffResult(
+         table_path=table_path,
+         version_a=version_a,
+         version_b=version_b,
+         rows_change=rows_change,
+         files_change=0,
+         size_change_bytes=0,
+         schema_added=list(schema_b - schema_a),
+         schema_removed=list(schema_a - schema_b),
+         schema_current=sorted(list(schema_b)),
+         schema_previous=sorted(list(schema_a)),
+         rows_added=rows_added_count,
+         rows_removed=rows_removed_count,
+         rows_updated=rows_updated_count,
+         sample_added=added_rows,
+         sample_removed=removed_rows,
+         sample_updated=updated_rows,
+         operations=operations,
+     )
+
+
+ def detect_drift(
+     table_path: str,
+     current_version: int,
+     baseline_version: int,
+     spark: Optional[Any] = None,
+     threshold_pct: float = 10.0,
+ ) -> Optional[str]:
+     """
+     Check for significant drift between versions.
+
+     Args:
+         table_path: Path to Delta table
+         current_version: Current version
+         baseline_version: Baseline version
+         spark: Optional SparkSession
+         threshold_pct: Row count change percentage that triggers a warning
+
+     Returns:
+         Warning message if drift detected, None otherwise
+     """
+     diff = get_delta_diff(table_path, baseline_version, current_version, spark=spark)
+
+     # Check schema drift
+     if diff.schema_added or diff.schema_removed:
+         return (
+             f"Schema drift detected: "
+             f"+{len(diff.schema_added)} columns, -{len(diff.schema_removed)} columns"
+         )
+
+     # The diff result carries rows_change but not the absolute baseline count,
+     # so read the baseline snapshot to get it.
+     if spark:
+         base_count = (
+             spark.read.format("delta")
+             .option("versionAsOf", baseline_version)
+             .load(table_path)
+             .count()
+         )
+     else:
+         from deltalake import DeltaTable
+
+         dt = DeltaTable(table_path)
+         dt.load_as_version(baseline_version)
+         base_count = len(dt.to_pandas())
+
+     if base_count == 0:
+         if diff.rows_change > 0:
+             return f"Data volume spike (0 -> {diff.rows_change} rows)"
+         return None
+
+     pct_change = abs(diff.rows_change) / base_count * 100
+
+     if pct_change > threshold_pct:
+         return f"Row count drift: {pct_change:.1f}% change (threshold: {threshold_pct}%)"
+
+     return None
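
A minimal usage sketch for this module, assuming a local Delta table at ./events_delta that has versions 3 and 5 and an id key column (the path, version numbers, and column name are illustrative, not part of the package):

    from odibi.diagnostics.delta import detect_drift, get_delta_diff

    # Deep, key-based comparison: counts plus 10-row samples of added/removed/updated rows
    diff = get_delta_diff("./events_delta", version_a=3, version_b=5, deep=True, keys=["id"])
    print(diff.rows_added, diff.rows_removed, diff.rows_updated)
    print(diff.schema_added, diff.schema_removed)
    for rec in diff.sample_updated or []:
        print(rec)

    # Drift gate: returns a warning string, or None if nothing crossed the threshold
    warning = detect_drift("./events_delta", current_version=5, baseline_version=3, threshold_pct=10.0)
    if warning:
        print(warning)

Passing spark=SparkSession would route the same calls through the delta-spark implementation instead of deltalake/pandas.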
odibi/diagnostics/diff.py
@@ -0,0 +1,169 @@
+ """
+ ODIBI Diff Tools
+ ================
+
+ Compare nodes and runs to identify changes in logic, data, or performance.
+ """
+
+ from dataclasses import dataclass, field
+ from typing import Dict, List, Optional
+
+ from odibi.story.metadata import NodeExecutionMetadata, PipelineStoryMetadata
+
+
+ @dataclass
+ class NodeDiffResult:
+     """Difference between two node executions."""
+
+     node_name: str
+
+     # Status
+     status_change: Optional[str] = None  # e.g. "success -> failed"
+
+     # Data
+     rows_out_a: int = 0
+     rows_out_b: int = 0
+     rows_diff: int = 0  # b - a
+
+     # Schema
+     schema_change: bool = False
+     columns_added: List[str] = field(default_factory=list)
+     columns_removed: List[str] = field(default_factory=list)
+
+     # Logic
+     sql_changed: bool = False
+     config_changed: bool = False
+     transformation_changed: bool = False
+
+     # Versioning
+     delta_version_change: Optional[str] = None  # "v1 -> v2"
+
+     @property
+     def has_drift(self) -> bool:
+         """Check if any significant drift occurred."""
+         return (
+             self.status_change is not None
+             or self.schema_change
+             or self.sql_changed
+             or self.config_changed
+             or self.transformation_changed
+         )
+
+
+ @dataclass
+ class RunDiffResult:
+     """Difference between two pipeline runs."""
+
+     run_id_a: str
+     run_id_b: str
+
+     node_diffs: Dict[str, NodeDiffResult] = field(default_factory=dict)
+     nodes_added: List[str] = field(default_factory=list)
+     nodes_removed: List[str] = field(default_factory=list)
+
+     # Impact analysis
+     drift_source_nodes: List[str] = field(default_factory=list)
+     impacted_downstream_nodes: List[str] = field(default_factory=list)
+
+
+ def diff_nodes(node_a: NodeExecutionMetadata, node_b: NodeExecutionMetadata) -> NodeDiffResult:
+     """
+     Compare two executions of the same node.
+
+     Args:
+         node_a: Baseline execution (Run A)
+         node_b: Current execution (Run B)
+
+     Returns:
+         NodeDiffResult
+     """
+     result = NodeDiffResult(
+         node_name=node_a.node_name, rows_out_a=node_a.rows_out or 0, rows_out_b=node_b.rows_out or 0
+     )
+
+     result.rows_diff = result.rows_out_b - result.rows_out_a
+
+     # Status check
+     if node_a.status != node_b.status:
+         result.status_change = f"{node_a.status} -> {node_b.status}"
+
+     # Schema check
+     schema_a = set(node_a.schema_out or [])
+     schema_b = set(node_b.schema_out or [])
+
+     if schema_a != schema_b:
+         result.schema_change = True
+         result.columns_added = list(schema_b - schema_a)
+         result.columns_removed = list(schema_a - schema_b)
+
+     # Logic check (SQL): prefer hash comparison when both hashes are available
+     if node_a.sql_hash and node_b.sql_hash:
+         if node_a.sql_hash != node_b.sql_hash:
+             result.sql_changed = True
+     elif node_a.executed_sql != node_b.executed_sql:
+         # Fall back to comparing the executed SQL directly
+         result.sql_changed = True
+
+     # Transformation stack check
+     if node_a.transformation_stack != node_b.transformation_stack:
+         result.transformation_changed = True
+
+     # Config check: deep dict comparison. Dynamic fields (e.g. timestamps) would
+     # register as changes if they leak into the snapshot.
+     if node_a.config_snapshot and node_b.config_snapshot:
+         if node_a.config_snapshot != node_b.config_snapshot:
+             result.config_changed = True
+
+     # Delta version check
+     ver_a = node_a.delta_info.version if node_a.delta_info else None
+     ver_b = node_b.delta_info.version if node_b.delta_info else None
+
+     if ver_a is not None and ver_b is not None and ver_a != ver_b:
+         result.delta_version_change = f"v{ver_a} -> v{ver_b}"
+
+     return result
+
+
+ def diff_runs(run_a: PipelineStoryMetadata, run_b: PipelineStoryMetadata) -> RunDiffResult:
+     """
+     Compare two pipeline runs node by node.
+
+     Args:
+         run_a: Baseline run (previous)
+         run_b: Current run (new)
+
+     Returns:
+         RunDiffResult
+     """
+     result = RunDiffResult(
+         run_id_a=getattr(run_a, "run_id", "unknown"), run_id_b=getattr(run_b, "run_id", "unknown")
+     )
+
+     # Index nodes by name
+     nodes_a = {n.node_name: n for n in run_a.nodes}
+     nodes_b = {n.node_name: n for n in run_b.nodes}
+
+     set_a = set(nodes_a.keys())
+     set_b = set(nodes_b.keys())
+
+     result.nodes_added = list(set_b - set_a)
+     result.nodes_removed = list(set_a - set_b)
+
+     common_nodes = set_a.intersection(set_b)
+
+     for name in common_nodes:
+         diff = diff_nodes(nodes_a[name], nodes_b[name])
+         result.node_diffs[name] = diff
+
+         if diff.has_drift:
+             if diff.sql_changed or diff.config_changed:
+                 # A logic change marks this node as a source of drift
+                 result.drift_source_nodes.append(name)
+             else:
+                 # Otherwise the drift is data/status impact, likely inherited from upstream
+                 result.impacted_downstream_nodes.append(name)
+
+     return result
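
As a reference for consumers of these results, a small sketch using only the dataclasses defined in this module; the node values are made up, and in practice diff_runs would populate them from two PipelineStoryMetadata objects:

    from odibi.diagnostics.diff import NodeDiffResult, RunDiffResult

    # A hypothetical node whose SQL changed between runs (values are illustrative)
    node_diff = NodeDiffResult(
        node_name="orders_clean",
        rows_out_a=1000,
        rows_out_b=1150,
        rows_diff=150,
        sql_changed=True,
    )
    assert node_diff.has_drift  # sql_changed alone is enough to flag drift

    run_diff = RunDiffResult(run_id_a="run-001", run_id_b="run-002")
    run_diff.node_diffs[node_diff.node_name] = node_diff
    run_diff.drift_source_nodes.append(node_diff.node_name)

    for name, d in run_diff.node_diffs.items():
        if d.has_drift:
            print(f"{name}: rows {d.rows_out_a} -> {d.rows_out_b} ({d.rows_diff:+d})")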