odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
+++ odibi/diagnostics/delta.py
@@ -0,0 +1,520 @@
"""
Delta Lake Diagnostics
======================

Tools for analyzing Delta Lake tables, history, and drift.
"""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional


@dataclass
class DeltaDiffResult:
    """Result of comparing two Delta table versions."""

    table_path: str
    version_a: int
    version_b: int

    # Metadata changes
    rows_change: int
    files_change: int
    size_change_bytes: int

    # Schema changes
    schema_added: List[str]
    schema_removed: List[str]

    schema_current: Optional[List[str]] = None
    schema_previous: Optional[List[str]] = None

    rows_added: Optional[int] = None
    rows_removed: Optional[int] = None
    rows_updated: Optional[int] = None

    # Operation info
    operations: Optional[List[str]] = None  # Operations that happened between versions

    # Data diff samples (optional)
    sample_added: Optional[List[Dict[str, Any]]] = None
    sample_removed: Optional[List[Dict[str, Any]]] = None
    sample_updated: Optional[List[Dict[str, Any]]] = None


def get_delta_diff(
    table_path: str,
    version_a: int,
    version_b: int,
    spark: Optional[Any] = None,
    deep: bool = False,
    keys: Optional[List[str]] = None,
) -> DeltaDiffResult:
    """
    Compare two versions of a Delta table.

    Args:
        table_path: Path to Delta table
        version_a: Start version
        version_b: End version
        spark: Optional SparkSession. If None, uses deltalake (Pandas).
        deep: If True, perform expensive row-by-row comparison (exceptAll).
            If False, rely on metadata and stats.
        keys: List of primary key columns for detecting updates.

    Returns:
        DeltaDiffResult object
    """
    if spark:
        return _get_delta_diff_spark(spark, table_path, version_a, version_b, deep, keys)
    else:
        return _get_delta_diff_pandas(table_path, version_a, version_b, deep, keys)


def _get_delta_diff_spark(
    spark: Any,
    table_path: str,
    version_a: int,
    version_b: int,
    deep: bool = False,
    keys: Optional[List[str]] = None,
) -> DeltaDiffResult:
    """Spark implementation of delta diff."""
    try:
        from delta.tables import DeltaTable
    except ImportError:
        raise ImportError("Delta Lake support requires 'delta-spark'")

    dt = DeltaTable.forPath(spark, table_path)
    history = dt.history().collect()

    # Filter history between versions: everything that happened AFTER
    # version_a up to and including version_b. History is usually reverse
    # ordered, so filter rather than slice.
    relevant_commits = [
        row
        for row in history
        if min(version_a, version_b) < row["version"] <= max(version_a, version_b)
    ]

    operations = [row["operation"] for row in relevant_commits]

    # Calculate expected row changes from commit metrics if available
    rows_change = 0
    files_change = 0
    bytes_change = 0

    for commit in relevant_commits:
        # Row supports item access (not .get); operationMetrics may be null
        metrics = commit["operationMetrics"] or {}

        # Heuristic based on operation type: net change is inserted - deleted
        inserted = int(metrics.get("numTargetRowsInserted", 0) or metrics.get("numOutputRows", 0))
        deleted = int(metrics.get("numTargetRowsDeleted", 0))

        # Direction matters: going a -> b with b > a sums forward; b < a reverts.
        factor = 1 if version_b > version_a else -1

        rows_change += (inserted - deleted) * factor

        # Files
        files_added = int(metrics.get("numFilesAdded", 0) or metrics.get("numAddedFiles", 0))
        files_removed = int(metrics.get("numFilesRemoved", 0) or metrics.get("numRemovedFiles", 0))
        files_change += (files_added - files_removed) * factor

        # Bytes
        bytes_added = int(metrics.get("numBytesAdded", 0) or metrics.get("numAddedBytes", 0))
        bytes_removed = int(metrics.get("numBytesRemoved", 0) or metrics.get("numRemovedBytes", 0))
        bytes_change += (bytes_added - bytes_removed) * factor

    # Load snapshots for schema. Spark is lazy, so defining the DataFrames is
    # cheap; the schema comes from the snapshot, not from history.
    df_a = spark.read.format("delta").option("versionAsOf", version_a).load(table_path)
    df_b = spark.read.format("delta").option("versionAsOf", version_b).load(table_path)

    schema_a = set(df_a.columns)
    schema_b = set(df_b.columns)

    # Deep diff logic
    added_rows = None
    removed_rows = None
    updated_rows = None
    rows_added_count = None
    rows_removed_count = None
    rows_updated_count = None

    if deep:
        # Actual row counts (authoritative vs metrics heuristic)
        rows_a = df_a.count()
        rows_b = df_b.count()
        rows_change = rows_b - rows_a  # Override heuristic

        common_cols = list(schema_a.intersection(schema_b))
        if common_cols:
            df_a_common = df_a.select(*common_cols)
            df_b_common = df_b.select(*common_cols)

            if keys and set(keys).issubset(common_cols):
                # --- Spark key-based diff ---
                # Join on keys to find Added, Removed, and Updated

                # 1. Added: in B but not in A (based on keys)
                diff_added = df_b_common.join(df_a_common, keys, "left_anti")
                rows_added_count = diff_added.count()
                added_rows = [row.asDict() for row in diff_added.limit(10).collect()]

                # 2. Removed: in A but not in B (based on keys)
                diff_removed = df_a_common.join(df_b_common, keys, "left_anti")
                rows_removed_count = diff_removed.count()
                removed_rows = [row.asDict() for row in diff_removed.limit(10).collect()]

                # 3. Updated: in both (inner join), but value columns differ
                value_cols = [c for c in common_cols if c not in keys]

                # Alias the DataFrames to avoid column ambiguity
                df_a_aliased = df_a_common.alias("a")
                df_b_aliased = df_b_common.alias("b")

                from pyspark.sql import functions as F

                # Build the change condition: OR over null-safe inequality of
                # each value column, i.e. NOT(a.col <=> b.col). Start with False.
                change_condition = F.lit(False)

                for col in value_cols:
                    col_changed = ~F.col(f"a.{col}").eqNullSafe(F.col(f"b.{col}"))
                    change_condition = change_condition | col_changed

                # Inner join on key equality, then filter to changed rows
                join_cond = [F.col(f"a.{k}") == F.col(f"b.{k}") for k in keys]

                diff_updated = (
                    df_b_aliased.join(df_a_aliased, join_cond, "inner")
                    .filter(change_condition)
                    .select("b.*")  # Return the 'new' state
                )

                rows_updated_count = diff_updated.count()

                # Grab up to 10 updated rows (new state)
                updated_rows = [row.asDict() for row in diff_updated.limit(10).collect()]

            else:
                # Fall back to a set diff when no usable keys are provided
                diff_added = df_b_common.exceptAll(df_a_common)
                diff_removed = df_a_common.exceptAll(df_b_common)

                # Get counts
                rows_added_count = diff_added.count()
                rows_removed_count = diff_removed.count()

                added_rows = [row.asDict() for row in diff_added.limit(10).collect()]
                removed_rows = [row.asDict() for row in diff_removed.limit(10).collect()]

    return DeltaDiffResult(
        table_path=table_path,
        version_a=version_a,
        version_b=version_b,
        rows_change=rows_change,
        files_change=files_change,
        size_change_bytes=bytes_change,
        schema_added=list(schema_b - schema_a),
        schema_removed=list(schema_a - schema_b),
        schema_current=sorted(schema_b),
        schema_previous=sorted(schema_a),
        rows_added=rows_added_count,
        rows_removed=rows_removed_count,
        rows_updated=rows_updated_count,
        sample_added=added_rows,
        sample_removed=removed_rows,
        sample_updated=updated_rows,
        operations=operations,
    )


def _get_delta_diff_pandas(
    table_path: str,
    version_a: int,
    version_b: int,
    deep: bool = False,
    keys: Optional[List[str]] = None,
) -> DeltaDiffResult:
    """Pandas (deltalake) implementation of delta diff."""
    try:
        import pandas as pd
        from deltalake import DeltaTable
    except ImportError:
        raise ImportError("Delta Lake support requires 'deltalake' and 'pandas'")

    dt = DeltaTable(table_path)

    # History
    history = dt.history()
    relevant_commits = [
        h for h in history if min(version_a, version_b) < h["version"] <= max(version_a, version_b)
    ]
    operations = [h["operation"] for h in relevant_commits]

    # Per-commit operation metrics are not reliably exposed by the deltalake
    # history wrapper, so file/byte stats are skipped and row counts come from
    # loading the snapshots. This assumes local/Pandas tables are small enough
    # to materialize.

    # Snapshot A
    dt.load_as_version(version_a)
    # Get the schema without loading data. The accessor changed across
    # deltalake releases (breaking changes around 0.15+), so probe both.
    schema_obj = dt.schema()
    if hasattr(schema_obj, "to_pyarrow"):
        arrow_schema_a = schema_obj.to_pyarrow()
    else:
        arrow_schema_a = schema_obj.to_arrow()

    schema_a = set(arrow_schema_a.names)

    # Load snapshot A once and count from the DataFrame
    df_a = dt.to_pandas()
    rows_a = len(df_a)

    dt.load_as_version(version_b)
    df_b = dt.to_pandas()

    rows_b = len(df_b)
    schema_b = set(df_b.columns)

    rows_change = rows_b - rows_a

    added_rows = None
    removed_rows = None
    updated_rows = None
    rows_added_count = None
    rows_removed_count = None
    rows_updated_count = None

    if deep:
        # Compute data diff. Pandas has no exceptAll; use merge with indicator.
        common_cols = list(schema_a.intersection(schema_b))

        if common_cols:
            # DO NOT restrict inputs to common_cols yet, or we lose
            # new/old-only columns for the samples.

            if keys and set(keys).issubset(common_cols):
                # --- Key-based diff (updates supported) ---
                # Outer merge on KEYS only
                merged = df_b.merge(
                    df_a, on=keys, how="outer", suffixes=("", "_old"), indicator=True
                )

                # Added: key in B only
                added_df = merged[merged["_merge"] == "left_only"]

                # Removed: key in A only
                removed_df = merged[merged["_merge"] == "right_only"]

                # Potential updates: key in both
                both_df = merged[merged["_merge"] == "both"]

                # For "both" rows, check whether any common non-key column changed
                value_cols = [c for c in common_cols if c not in keys]

                updated_records = []

                for _, row in both_df.iterrows():
                    changes = {}
                    has_change = False
                    for col in value_cols:
                        new_val = row[col]
                        old_val = row[f"{col}_old"]

                        # Handle nulls/NaN equality
                        if pd.isna(new_val) and pd.isna(old_val):
                            continue
                        if new_val != old_val:
                            changes[col] = {"old": old_val, "new": new_val}
                            has_change = True

                    if has_change:
                        # Build a record that has keys + changes
                        rec = {k: row[k] for k in keys}
                        rec["_changes"] = changes
                        updated_records.append(rec)

                rows_added_count = len(added_df)
                rows_removed_count = len(removed_df)
                rows_updated_count = len(updated_records)

                # Format samples as plain dicts (drop _old columns and _merge).
                # For added rows we want ALL columns of B; added_df comes from
                # df_b, so every schema_b column is present (any _old columns
                # are NaN and are dropped by the selection).
                cols_b = list(schema_b)
                added_rows = added_df[cols_b].head(10).to_dict("records")

                # For removed rows we want ALL columns of A. After the merge:
                #   1. Keys carry no suffix.
                #   2. Common non-key columns collided, so A's copy has the
                #      "_old" suffix (the left/B copy got "").
                #   3. Columns unique to A (dropped columns) had no collision,
                #      so they keep their original name.
                removed_clean = []
                for _, row in removed_df.head(10).iterrows():
                    rec = {}
                    for col in schema_a:
                        if col in keys:
                            rec[col] = row[col]
                        elif col in common_cols:
                            # Collided column: in a right_only row we want the
                            # 'right' (A) version, which carries the _old suffix.
                            rec[col] = row[f"{col}_old"]
                        else:
                            # Unique to A (deleted column). No collision.
                            if col in row:
                                rec[col] = row[col]
                    removed_clean.append(rec)
                removed_rows = removed_clean

                updated_rows = updated_records[:10]

            else:
                # --- Set-based diff (no keys) ---
                # Merge on all common columns; rows only match when every
                # common column is equal, so updates cannot be told apart
                # from an add + remove pair here.
                merged = df_b.merge(df_a, on=common_cols, how="outer", indicator=True)

                # Rows only in B (new/added) -> left_only
                added_df = merged[merged["_merge"] == "left_only"]

                # Rows only in A (old/removed) -> right_only
                removed_df = merged[merged["_merge"] == "right_only"]

                rows_added_count = len(added_df)
                rows_removed_count = len(removed_df)

                # For added rows, show columns from B. Columns unique to B had
                # no collision and are present; common columns were joined on.
                cols_b = list(schema_b)
                added_rows = added_df[cols_b].head(10).to_dict("records")

                # For removed rows, show columns from A
                cols_a = list(schema_a)
                removed_rows = removed_df[cols_a].head(10).to_dict("records")

    return DeltaDiffResult(
        table_path=table_path,
        version_a=version_a,
        version_b=version_b,
        rows_change=rows_change,
        files_change=0,
        size_change_bytes=0,
        schema_added=list(schema_b - schema_a),
        schema_removed=list(schema_a - schema_b),
        schema_current=sorted(schema_b),
        schema_previous=sorted(schema_a),
        rows_added=rows_added_count,
        rows_removed=rows_removed_count,
        rows_updated=rows_updated_count,
        sample_added=added_rows,
        sample_removed=removed_rows,
        sample_updated=updated_rows,
        operations=operations,
    )


def detect_drift(
    table_path: str,
    current_version: int,
    baseline_version: int,
    spark: Optional[Any] = None,
    threshold_pct: float = 10.0,
) -> Optional[str]:
    """
    Check for significant drift between versions.

    Args:
        table_path: Path to Delta table
        current_version: Current version
        baseline_version: Baseline version
        spark: Optional SparkSession
        threshold_pct: Row count change percentage to trigger warning

    Returns:
        Warning message if drift detected, None otherwise
    """
    diff = get_delta_diff(table_path, baseline_version, current_version, spark=spark)

    # Check schema drift
    if diff.schema_added or diff.schema_removed:
        return (
            f"Schema drift detected: "
            f"+{len(diff.schema_added)} columns, -{len(diff.schema_removed)} columns"
        )

    # DeltaDiffResult carries only the relative rows_change, not the absolute
    # baseline count, so read the baseline row count separately.
    if spark:
        base_count = (
            spark.read.format("delta")
            .option("versionAsOf", baseline_version)
            .load(table_path)
            .count()
        )
    else:
        from deltalake import DeltaTable

        dt = DeltaTable(table_path)
        dt.load_as_version(baseline_version)
        base_count = len(dt.to_pandas())

    if base_count == 0:
        if diff.rows_change > 0:
            return f"Data volume spike (0 -> {diff.rows_change} rows)"
        return None

    pct_change = abs(diff.rows_change) / base_count * 100

    if pct_change > threshold_pct:
        return f"Row count drift: {pct_change:.1f}% change (Threshold: {threshold_pct}%)"

    return None
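
For orientation, here is a minimal usage sketch for the module above. The table path ./sales_delta, the version numbers, and the id key column are hypothetical; get_delta_diff and detect_drift are the module's actual entry points, and passing spark=SparkSession switches to the Spark implementation.

# Usage sketch (hypothetical path, versions, and key column)
from odibi.diagnostics.delta import get_delta_diff, detect_drift

# Shallow diff: commit metrics and schema only (cheap, no row scans)
diff = get_delta_diff("./sales_delta", version_a=3, version_b=5)
print(diff.rows_change, diff.schema_added, diff.operations)

# Deep diff: row-level comparison keyed on 'id' (loads both snapshots)
deep = get_delta_diff("./sales_delta", version_a=3, version_b=5, deep=True, keys=["id"])
print(deep.rows_added, deep.rows_removed, deep.rows_updated)

# Drift gate: returns a human-readable warning, or None if within threshold
warning = detect_drift("./sales_delta", current_version=5, baseline_version=3, threshold_pct=10.0)
if warning:
    print(warning)
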
+++ odibi/diagnostics/diff.py
@@ -0,0 +1,169 @@
"""
ODIBI Diff Tools
================

Compare nodes and runs to identify changes in logic, data, or performance.
"""

from dataclasses import dataclass, field
from typing import Dict, List, Optional

from odibi.story.metadata import NodeExecutionMetadata, PipelineStoryMetadata


@dataclass
class NodeDiffResult:
    """Difference between two node executions."""

    node_name: str

    # Status
    status_change: Optional[str] = None  # e.g. "success -> failed"

    # Data
    rows_out_a: int = 0
    rows_out_b: int = 0
    rows_diff: int = 0  # b - a

    # Schema
    schema_change: bool = False
    columns_added: List[str] = field(default_factory=list)
    columns_removed: List[str] = field(default_factory=list)

    # Logic
    sql_changed: bool = False
    config_changed: bool = False
    transformation_changed: bool = False

    # Versioning
    delta_version_change: Optional[str] = None  # "v1 -> v2"

    @property
    def has_drift(self) -> bool:
        """Check if any significant drift occurred."""
        return (
            self.status_change is not None
            or self.schema_change
            or self.sql_changed
            or self.config_changed
            or self.transformation_changed
        )


@dataclass
class RunDiffResult:
    """Difference between two pipeline runs."""

    run_id_a: str
    run_id_b: str

    node_diffs: Dict[str, NodeDiffResult] = field(default_factory=dict)
    nodes_added: List[str] = field(default_factory=list)
    nodes_removed: List[str] = field(default_factory=list)

    # Impact analysis
    drift_source_nodes: List[str] = field(default_factory=list)
    impacted_downstream_nodes: List[str] = field(default_factory=list)


def diff_nodes(node_a: NodeExecutionMetadata, node_b: NodeExecutionMetadata) -> NodeDiffResult:
    """
    Compare two executions of the same node.

    Args:
        node_a: Baseline execution (Run A)
        node_b: Current execution (Run B)

    Returns:
        NodeDiffResult
    """
    result = NodeDiffResult(
        node_name=node_a.node_name, rows_out_a=node_a.rows_out or 0, rows_out_b=node_b.rows_out or 0
    )

    result.rows_diff = result.rows_out_b - result.rows_out_a

    # Status check
    if node_a.status != node_b.status:
        result.status_change = f"{node_a.status} -> {node_b.status}"

    # Schema check
    schema_a = set(node_a.schema_out or [])
    schema_b = set(node_b.schema_out or [])

    if schema_a != schema_b:
        result.schema_change = True
        result.columns_added = list(schema_b - schema_a)
        result.columns_removed = list(schema_a - schema_b)

    # Logic check (SQL): prefer hash comparison if available
    if node_a.sql_hash and node_b.sql_hash:
        if node_a.sql_hash != node_b.sql_hash:
            result.sql_changed = True
    elif node_a.executed_sql != node_b.executed_sql:
        # Fall back to comparing the executed SQL directly
        result.sql_changed = True

    # Transformation stack check
    if node_a.transformation_stack != node_b.transformation_stack:
        result.transformation_changed = True

    # Config check: dict equality is a deep, order-insensitive comparison.
    # We might want to exclude timestamps or dynamic fields if they leak into config.
    if node_a.config_snapshot and node_b.config_snapshot:
        if node_a.config_snapshot != node_b.config_snapshot:
            result.config_changed = True

    # Delta version check
    ver_a = node_a.delta_info.version if node_a.delta_info else None
    ver_b = node_b.delta_info.version if node_b.delta_info else None

    if ver_a is not None and ver_b is not None and ver_a != ver_b:
        result.delta_version_change = f"v{ver_a} -> v{ver_b}"

    return result


def diff_runs(run_a: PipelineStoryMetadata, run_b: PipelineStoryMetadata) -> RunDiffResult:
    """
    Compare two pipeline runs node by node.

    Args:
        run_a: Baseline run (Previous)
        run_b: Current run (New)

    Returns:
        RunDiffResult
    """
    result = RunDiffResult(
        run_id_a=getattr(run_a, "run_id", "unknown"), run_id_b=getattr(run_b, "run_id", "unknown")
    )

    # Index nodes by name
    nodes_a = {n.node_name: n for n in run_a.nodes}
    nodes_b = {n.node_name: n for n in run_b.nodes}

    set_a = set(nodes_a.keys())
    set_b = set(nodes_b.keys())

    result.nodes_added = list(set_b - set_a)
    result.nodes_removed = list(set_a - set_b)

    common_nodes = set_a.intersection(set_b)

    for name in common_nodes:
        diff = diff_nodes(nodes_a[name], nodes_b[name])
        result.node_diffs[name] = diff

        # has_drift already covers sql_changed and config_changed
        if diff.has_drift:
            if diff.sql_changed or diff.config_changed:
                # A logic change marks this node as a source of drift
                result.drift_source_nodes.append(name)
            else:
                # Otherwise it is data drift / downstream impact
                result.impacted_downstream_nodes.append(name)

    return result
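
And a minimal sketch of how the run-diff tools above might be driven. summarize_drift is a hypothetical helper; the PipelineStoryMetadata inputs are assumed to be already loaded from two persisted runs (loading is out of scope here), and the field names all come from the dataclasses above.

# Usage sketch (summarize_drift is hypothetical; run_a/run_b assumed loaded)
from odibi.diagnostics.diff import diff_runs
from odibi.story.metadata import PipelineStoryMetadata

def summarize_drift(run_a: PipelineStoryMetadata, run_b: PipelineStoryMetadata) -> None:
    # run_a is the baseline, run_b the current run
    result = diff_runs(run_a, run_b)

    for name in result.drift_source_nodes:
        d = result.node_diffs[name]
        print(f"{name}: logic drift (sql={d.sql_changed}, config={d.config_changed})")

    for name in result.impacted_downstream_nodes:
        d = result.node_diffs[name]
        print(f"{name}: data drift, rows {d.rows_out_a} -> {d.rows_out_b} ({d.rows_diff:+d})")
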