odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/transformers/merge_transformer.py (new file)
@@ -0,0 +1,778 @@
import os
import time
from enum import Enum
from typing import List, Optional

from pydantic import BaseModel, Field, field_validator, model_validator

from odibi.context import EngineContext, PandasContext, SparkContext
from odibi.registry import transform
from odibi.utils.logging_context import get_logging_context

try:
    from delta.tables import DeltaTable
except ImportError:
    DeltaTable = None


class MergeStrategy(str, Enum):
    UPSERT = "upsert"
    APPEND_ONLY = "append_only"
    DELETE_MATCH = "delete_match"


class AuditColumnsConfig(BaseModel):
    created_col: Optional[str] = Field(
        default=None, description="Column to set only on first insert"
    )
    updated_col: Optional[str] = Field(default=None, description="Column to update on every merge")

    @model_validator(mode="after")
    def at_least_one(self):
        if not self.created_col and not self.updated_col:
            raise ValueError(
                "Merge.audit_cols: specify at least one of 'created_col' or 'updated_col'."
            )
        return self


class MergeParams(BaseModel):
    """
    Configuration for Merge transformer (Upsert/Append).

    ### ⚖️ "GDPR & Compliance" Guide

    **Business Problem:**
    "A user exercised their 'Right to be Forgotten'. We need to remove them from our Silver tables immediately."

    **The Solution:**
    Use the `delete_match` strategy. The source dataframe contains the IDs to be deleted, and the transformer removes them from the target.

    **Recipe 1: Right to be Forgotten (Delete)**
    ```yaml
    transformer: "merge"
    params:
      target: "silver.customers"
      keys: ["customer_id"]
      strategy: "delete_match"
    ```

    **Recipe 2: Conditional Update (SCD Type 1)**
    "Only update if the source record is newer than the target record."
    ```yaml
    transformer: "merge"
    params:
      target: "silver.products"
      keys: ["product_id"]
      strategy: "upsert"
      update_condition: "source.updated_at > target.updated_at"
    ```

    **Recipe 3: Safe Insert (Filter Bad Records)**
    "Only insert records that are not marked as deleted."
    ```yaml
    transformer: "merge"
    params:
      target: "silver.orders"
      keys: ["order_id"]
      strategy: "append_only"
      insert_condition: "source.is_deleted = false"
    ```

    **Recipe 4: Audit Columns**
    "Track when records were created or updated."
    ```yaml
    transformer: "merge"
    params:
      target: "silver.users"
      keys: ["user_id"]
      audit_cols:
        created_col: "dw_created_at"
        updated_col: "dw_updated_at"
    ```

    **Recipe 5: Full Sync (Insert + Update + Delete)**
    "Sync target with source: insert new, update changed, and remove soft-deleted."
    ```yaml
    transformer: "merge"
    params:
      target: "silver.customers"
      keys: ["id"]
      strategy: "upsert"
      # 1. Delete if source says so
      delete_condition: "source.is_deleted = true"
      # 2. Update if changed (and not deleted)
      update_condition: "source.hash != target.hash"
      # 3. Insert new (and not deleted)
      insert_condition: "source.is_deleted = false"
    ```

    **Recipe 6: Connection-based Path Resolution (ADLS)**
    "Use a connection to resolve paths, just like write config."
    ```yaml
    transform:
      steps:
        - function: merge
          params:
            connection: goat_prod
            path: OEE/silver/customers
            register_table: silver.customers
            keys: ["customer_id"]
            strategy: "upsert"
            audit_cols:
              created_col: "_created_at"
              updated_col: "_updated_at"
    ```

    **Strategies:**
    * **upsert** (Default): Update existing records, insert new ones.
    * **append_only**: Ignore duplicates, only insert new keys.
    * **delete_match**: Delete records in target that match keys in source.
    """

    target: Optional[str] = Field(
        None,
        description="Target table name or full path (use this OR connection+path)",
    )
    connection: Optional[str] = Field(
        None,
        description="Connection name to resolve path (use with 'path' param)",
    )
    path: Optional[str] = Field(
        None,
        description="Relative path within connection (e.g., 'OEE/silver/customers')",
    )
    register_table: Optional[str] = Field(
        None,
        description="Register as Unity Catalog/metastore table after merge (e.g., 'silver.customers')",
    )
    keys: List[str] = Field(..., description="List of join keys")
    strategy: MergeStrategy = Field(
        default=MergeStrategy.UPSERT,
        description="Merge behavior: 'upsert', 'append_only', 'delete_match'",
    )
    audit_cols: Optional[AuditColumnsConfig] = Field(
        None, description="{'created_col': '...', 'updated_col': '...'}"
    )
    optimize_write: bool = Field(False, description="Run OPTIMIZE after write (Spark)")
    zorder_by: Optional[List[str]] = Field(None, description="Columns to Z-Order by")
    cluster_by: Optional[List[str]] = Field(
        None, description="Columns to Liquid Cluster by (Delta)"
    )
    update_condition: Optional[str] = Field(
        None, description="SQL condition for update clause (e.g. 'source.ver > target.ver')"
    )
    insert_condition: Optional[str] = Field(
        None, description="SQL condition for insert clause (e.g. 'source.status != \"deleted\"')"
    )
    delete_condition: Optional[str] = Field(
        None, description="SQL condition for delete clause (e.g. 'source.status = \"deleted\"')"
    )
    table_properties: Optional[dict] = Field(
        None,
        description="Delta table properties for initial table creation (e.g., column mapping)",
    )

    @field_validator("keys")
    @classmethod
    def check_keys(cls, v):
        if not v:
            raise ValueError(
                "Merge: 'keys' must not be empty. "
                "Provide at least one column name to join source and target on. "
                f"Got: {v!r}"
            )
        return v

    @model_validator(mode="after")
    def check_target_or_connection(self):
        """Ensure either target or connection+path is provided."""
        if not self.target and not (self.connection and self.path):
            raise ValueError("Merge: provide either 'target' OR both 'connection' and 'path'.")
        if self.target and (self.connection or self.path):
            raise ValueError("Merge: use 'target' OR 'connection'+'path', not both.")
        return self

    @model_validator(mode="after")
    def check_strategy_and_audit(self):
        if self.strategy == MergeStrategy.DELETE_MATCH and self.audit_cols:
            raise ValueError("Merge: 'audit_cols' is not used with strategy='delete_match'.")
        return self


@transform("merge", category="transformer", param_model=MergeParams)
def merge(context, params=None, current=None, **kwargs):
    """
    Merge transformer implementation.
    Handles Upsert, Append-Only, and Delete-Match strategies.

    Args:
        context: EngineContext (preferred) or legacy PandasContext/SparkContext
        params: MergeParams object (when called via function step) or DataFrame (legacy)
        current: DataFrame (legacy positional arg, deprecated)
        **kwargs: Parameters when not using MergeParams
    """
    ctx = get_logging_context()
    start_time = time.time()

    # Handle legacy signature: merge(context, source_df, **params)
    # where params (2nd arg) is actually the DataFrame
    if params is not None and not isinstance(params, MergeParams):
        # Legacy call: params is actually the DataFrame
        current = params
        merge_params = MergeParams(**kwargs)
    elif isinstance(params, MergeParams):
        merge_params = params
    else:
        merge_params = MergeParams(**kwargs)

    # Get current DataFrame: prefer explicit current, then context.df
    if current is None:
        if hasattr(context, "df"):
            current = context.df
        else:
            raise ValueError(
                f"Merge requires a DataFrame but none was provided. "
                f"Either pass a DataFrame as the 'current' argument, or ensure context.df is set. "
                f"Context type: {type(context).__name__}. Has 'df' attr: {hasattr(context, 'df')}."
            )

    # Resolve target path from connection if provided
    target = merge_params.target
    register_table = merge_params.register_table

    if merge_params.connection and merge_params.path:
        # Resolve path via connection
        connection = None
        if hasattr(context, "engine") and hasattr(context.engine, "connections"):
            connections = context.engine.connections
            if connections and merge_params.connection in connections:
                connection = connections[merge_params.connection]

        if connection is None:
            raise ValueError(
                f"Merge: connection '{merge_params.connection}' not found. "
                "Ensure the connection is defined in your project config."
            )

        if hasattr(connection, "get_path"):
            target = connection.get_path(merge_params.path)
            ctx.debug(
                "Resolved merge target path via connection",
                connection=merge_params.connection,
                relative_path=merge_params.path,
                resolved_path=target,
            )
        else:
            raise ValueError(
                f"Merge: connection '{merge_params.connection}' (type: {type(connection).__name__}) "
                f"does not support path resolution. Expected a connection with 'get_path' method. "
                f"Connection type must be 'local', 'adls', or similar file-based connection."
            )

    ctx.debug(
        "Merge starting",
        target=target,
        keys=merge_params.keys,
        strategy=merge_params.strategy.value,
        register_table=register_table,
    )

    # Get source row count
    rows_before = None
    try:
        rows_before = current.shape[0] if hasattr(current, "shape") else None
        if rows_before is None and hasattr(current, "count"):
            rows_before = current.count()
    except Exception as e:
        ctx.debug(f"Could not get row count: {type(e).__name__}")

    ctx.debug("Merge source loaded", source_rows=rows_before)

    # Unwrap EngineContext if present
    real_context = context
    if isinstance(context, EngineContext):
        real_context = context.context

    keys = merge_params.keys
    strategy = merge_params.strategy
    audit_cols = merge_params.audit_cols

    # Optimization params
    optimize_write = merge_params.optimize_write
    zorder_by = merge_params.zorder_by
    cluster_by = merge_params.cluster_by

    if isinstance(real_context, SparkContext):
        result = _merge_spark(
            context,
            current,
            target,
            keys,
            strategy,
            audit_cols,
            optimize_write,
            zorder_by,
            cluster_by,
            merge_params.update_condition,
            merge_params.insert_condition,
            merge_params.delete_condition,
            merge_params.table_properties,
            kwargs,
        )
    elif isinstance(real_context, PandasContext):
        result = _merge_pandas(context, current, target, keys, strategy, audit_cols, kwargs)
    else:
        ctx.error("Merge failed: unsupported context", context_type=str(type(real_context)))
        raise ValueError(f"Unsupported context type: {type(real_context)}")

    # Register table in metastore if requested (Spark only)
    if register_table and isinstance(real_context, SparkContext):
        try:
            spark = context.spark
            if spark:
                ctx.debug(
                    "Registering table in metastore",
                    table_name=register_table,
                    location=target,
                )
                spark.sql(
                    f"CREATE TABLE IF NOT EXISTS {register_table} USING DELTA LOCATION '{target}'"
                )
                ctx.info(
                    "Table registered successfully",
                    table_name=register_table,
                    location=target,
                )
        except Exception as e:
            ctx.warning(
                f"Failed to register table: {e}",
                table_name=register_table,
                error=str(e),
            )

    elapsed_ms = (time.time() - start_time) * 1000
    ctx.debug(
        "Merge completed",
        target=target,
        strategy=merge_params.strategy.value,
        source_rows=rows_before,
        elapsed_ms=round(elapsed_ms, 2),
    )

    return result


def _merge_spark(
    context,
    source_df,
    target,
    keys,
    strategy,
    audit_cols,
    optimize_write,
    zorder_by,
    cluster_by,
    update_condition,
    insert_condition,
    delete_condition,
    table_properties,
    params,
):
    if DeltaTable is None:
        raise ImportError("Spark Merge Transformer requires 'delta-spark' package.")

    spark = context.spark

    # Import Spark functions inside the function to avoid module-level unused imports
    from pyspark.sql.functions import current_timestamp

    # Add Audit Columns to Source
    if audit_cols:
        created_col = audit_cols.created_col
        updated_col = audit_cols.updated_col

        if updated_col:
            source_df = source_df.withColumn(updated_col, current_timestamp())

        if created_col and created_col not in source_df.columns:
            source_df = source_df.withColumn(created_col, current_timestamp())

    def get_delta_table():
        # Heuristic: if it looks like a path, use forPath, else forName
        # Path indicators: /, \, :, or starts with .
        if "/" in target or "\\" in target or ":" in target or target.startswith("."):
            return DeltaTable.forPath(spark, target)
        return DeltaTable.forName(spark, target)

    def merge_batch(batch_df, batch_id=None):
        # Check if table exists
        is_delta = False
        try:
            if "/" in target or "\\" in target or ":" in target or target.startswith("."):
                is_delta = DeltaTable.isDeltaTable(spark, target)
            else:
                # For table name, try to access it
                try:
                    DeltaTable.forName(spark, target)
                    is_delta = True
                except Exception:
                    is_delta = False
        except Exception:
            is_delta = False

        if is_delta:
            delta_table = get_delta_table()

            condition = " AND ".join([f"target.`{k}` = source.`{k}`" for k in keys])
            merger = delta_table.alias("target").merge(batch_df.alias("source"), condition)

            orig_auto_merge = None
            if strategy == MergeStrategy.UPSERT:
                # Construct update map
                update_expr = {}
                for col_name in batch_df.columns:
                    # Skip created_col in update
                    if audit_cols and audit_cols.created_col == col_name:
                        continue

                    # Note: When Delta Merge UPDATE SET uses column names from source that
                    # do NOT exist in target, it throws UNRESOLVED_EXPRESSION if schema evolution
                    # is not enabled or handled automatically by the merge operation for updates.

                    update_expr[f"`{col_name}`"] = f"source.`{col_name}`"

                # Enable automatic schema evolution for the merge
                # This is critical for adding new columns (like audit cols)

                # Capture original state to avoid side effects
                orig_auto_merge = spark.conf.get(
                    "spark.databricks.delta.schema.autoMerge.enabled", "false"
                )
                spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

                if delete_condition:
                    merger = merger.whenMatchedDelete(condition=delete_condition)

                merger = merger.whenMatchedUpdate(set=update_expr, condition=update_condition)
                merger = merger.whenNotMatchedInsertAll(condition=insert_condition)

            elif strategy == MergeStrategy.APPEND_ONLY:
                merger = merger.whenNotMatchedInsertAll(condition=insert_condition)

            elif strategy == MergeStrategy.DELETE_MATCH:
                merger = merger.whenMatchedDelete(condition=delete_condition)

            try:
                merger.execute()
            finally:
                # Restore configuration if we changed it
                if orig_auto_merge is not None:
                    spark.conf.set(
                        "spark.databricks.delta.schema.autoMerge.enabled", orig_auto_merge
                    )

        else:
            # Table does not exist
            if strategy == MergeStrategy.DELETE_MATCH:
                get_logging_context().warning(
                    f"Target {target} does not exist. Delete match skipped."
                )
                return

            # Initial write
            # If cluster_by is present, we delegate to engine.write logic?
            # Or implement CTAS here similar to engine.write

            # Build TBLPROPERTIES clause if table_properties provided
            tbl_props_clause = ""
            if table_properties:
                props_str = ", ".join(f"'{k}' = '{v}'" for k, v in table_properties.items())
                tbl_props_clause = f" TBLPROPERTIES ({props_str})"

            if cluster_by:
                # Use CTAS logic for Liquid Clustering creation
                if isinstance(cluster_by, str):
                    cluster_cols = [cluster_by]
                else:
                    cluster_cols = cluster_by

                cols = ", ".join(f"`{c}`" for c in cluster_cols)
                # Create temp view
                temp_view = f"odibi_merge_init_{abs(hash(target))}"
                batch_df.createOrReplaceTempView(temp_view)

                # Determine target type (path vs table)
                is_path = "/" in target or "\\" in target or ":" in target or target.startswith(".")
                target_identifier = f"delta.`{target}`" if is_path else target

                spark.sql(
                    f"CREATE TABLE IF NOT EXISTS {target_identifier} USING DELTA{tbl_props_clause} CLUSTER BY ({cols}) AS SELECT * FROM {temp_view}"
                )
                spark.catalog.dropTempView(temp_view)
            else:
                # Create temp view for CTAS with properties
                temp_view = f"odibi_merge_init_{abs(hash(target))}"
                batch_df.createOrReplaceTempView(temp_view)

                is_path = "/" in target or "\\" in target or ":" in target or target.startswith(".")
                target_identifier = f"delta.`{target}`" if is_path else target

                if table_properties:
                    # Use CTAS to apply table properties
                    spark.sql(
                        f"CREATE TABLE IF NOT EXISTS {target_identifier} USING DELTA{tbl_props_clause} AS SELECT * FROM {temp_view}"
                    )
                    spark.catalog.dropTempView(temp_view)
                else:
                    # Original path: use DataFrameWriter
                    spark.catalog.dropTempView(temp_view)
                    writer = batch_df.write.format("delta").mode("overwrite")

                    if is_path:
                        writer.save(target)
                    else:
                        writer.saveAsTable(target)

        # --- Post-Merge Optimization ---
        if optimize_write or zorder_by:
            try:
                # Identify if target is table or path
                is_path = "/" in target or "\\" in target or ":" in target or target.startswith(".")

                if is_path:
                    sql = f"OPTIMIZE delta.`{target}`"
                else:
                    sql = f"OPTIMIZE {target}"

                if zorder_by:
                    if isinstance(zorder_by, str):
                        zorder_cols = [zorder_by]
                    else:
                        zorder_cols = zorder_by

                    cols = ", ".join(f"`{c}`" for c in zorder_cols)
                    sql += f" ZORDER BY ({cols})"

                spark.sql(sql)
            except Exception as e:
                get_logging_context().warning(f"Optimization failed for {target}: {e}")

    if source_df.isStreaming:
        # For streaming, wraps logic in foreachBatch
        query = source_df.writeStream.foreachBatch(merge_batch).start()
        return query
    else:
        merge_batch(source_df)
        return source_df


def _merge_pandas(context, source_df, target, keys, strategy, audit_cols, params):
    import pandas as pd

    # Try using DuckDB for scalability if available
    try:
        import duckdb

        HAS_DUCKDB = True
    except ImportError:
        HAS_DUCKDB = False

    # Pandas implementation for local dev (Parquet focus)
    path = target

    # Resolve path if context has engine (EngineContext)
    if hasattr(context, "engine") and context.engine:
        # Try to resolve 'connection.path'
        if "." in target:
            parts = target.split(".", 1)
            conn_name = parts[0]
            rel_path = parts[1]
            if conn_name in context.engine.connections:
                try:
                    path = context.engine.connections[conn_name].get_path(rel_path)
                except Exception as e:
                    get_logging_context().debug(
                        f"Could not resolve connection path: {type(e).__name__}"
                    )

    if not ("/" in path or "\\" in path or ":" in path or path.startswith(".")):
        # If it looks like a table name, try to treat as local path under data/
        # or just warn.
        # For MVP, assuming it's a path or resolved by user.
        pass

    # Audit columns
    now = pd.Timestamp.now()
    if audit_cols:
        created_col = audit_cols.created_col
        updated_col = audit_cols.updated_col

        if updated_col:
            source_df[updated_col] = now
        if created_col and created_col not in source_df.columns:
            source_df[created_col] = now

    # Check if target exists
    target_exists = False
    if os.path.exists(path):
        # Check if it's a file or directory (DuckDB handles parquet files)
        target_exists = True

    # --- DUCKDB PATH ---
    if HAS_DUCKDB and str(path).endswith(".parquet"):
        try:
            con = duckdb.connect(database=":memory:")

            # Register source_df
            con.register("source_df", source_df)

            if not target_exists:
                if strategy == MergeStrategy.DELETE_MATCH:
                    return source_df  # Nothing to delete from

                # Initial Write
                os.makedirs(os.path.dirname(path), exist_ok=True)
                con.execute(f"COPY (SELECT * FROM source_df) TO '{path}' (FORMAT PARQUET)")
                return source_df

            # Construct Merge Query
            # We need to quote columns properly? DuckDB usually handles simple names.
            # Assuming keys are simple.

            # Join condition: s.k1 = t.k1 AND s.k2 = t.k2
            # Quote column names with double quotes for DuckDB compatibility
            join_cond = " AND ".join([f's."{k}" = t."{k}"' for k in keys])

            query = ""
            if strategy == MergeStrategy.UPSERT:
                # Logic: (Source) UNION ALL (Target WHERE NOT EXISTS in Source)
                # Note: This replaces the whole row with Source version (Update)
                # Special handling for created_col: If updating, preserve target's created_col?

                # If created_col exists, we want to use Target's created_col for updates?
                # But "Source" row has new created_col (current time) which is wrong for update.
                # Ideally: SELECT s.* EXCEPT (created_col), t.created_col ...
                # But 'EXCEPT' is post-projection.
                # Simpler: Just overwrite. If user wants to preserve, they shouldn't overwrite it in source.
                # BUT audit logic above set created_col in source.
                # If we are strictly upserting, maybe we should handle it.
                # For performance, let's stick to standard Upsert (Source wins).

                query = f"""
                    SELECT * FROM source_df
                    UNION ALL
                    SELECT * FROM read_parquet('{path}') t
                    WHERE NOT EXISTS (
                        SELECT 1 FROM source_df s WHERE {join_cond}
                    )
                """

            elif strategy == MergeStrategy.APPEND_ONLY:
                # Logic: (Source WHERE NOT EXISTS in Target) UNION ALL (Target)
                query = f"""
                    SELECT * FROM source_df s
                    WHERE NOT EXISTS (
                        SELECT 1 FROM read_parquet('{path}') t WHERE {join_cond}
                    )
                    UNION ALL
                    SELECT * FROM read_parquet('{path}')
                """

            elif strategy == MergeStrategy.DELETE_MATCH:
                # Logic: Target WHERE NOT EXISTS in Source
                query = f"""
                    SELECT * FROM read_parquet('{path}') t
                    WHERE NOT EXISTS (
                        SELECT 1 FROM source_df s WHERE {join_cond}
                    )
                """

            # Execute Atomic Write
            # Write to temp file then rename
            temp_path = str(path) + ".tmp.parquet"
            con.execute(f"COPY ({query}) TO '{temp_path}' (FORMAT PARQUET)")

            # Close connection before file ops
            con.close()

            # Replace
            if os.path.exists(temp_path):
                if os.path.exists(path):
                    os.remove(path)
                os.rename(temp_path, path)

            return source_df

        except Exception as e:
            # Fallback to Pandas if DuckDB fails (e.g. complex types, memory)
            get_logging_context().warning(f"DuckDB merge failed, falling back to Pandas: {e}")
            pass

    # --- PANDAS FALLBACK ---
    target_df = pd.DataFrame()
    if os.path.exists(path):
        try:
            # Try reading as parquet
            target_df = pd.read_parquet(path)
        except Exception as e:
            get_logging_context().debug(f"Could not read target file: {type(e).__name__}")

    if target_df.empty:
        if strategy == MergeStrategy.DELETE_MATCH:
            return source_df

        # Write source as initial
        os.makedirs(os.path.dirname(path), exist_ok=True)
        source_df.to_parquet(path, index=False)
        return source_df

    # Align schemas if needed (simple intersection?)
    # For now, assuming schema matches or pandas handles it (NaNs)

    # Set index for update/difference
    # Ensure keys exist
    for k in keys:
        if k not in target_df.columns or k not in source_df.columns:
            raise ValueError(
                f"Merge key column '{k}' not found in DataFrame. "
                f"Target columns: {list(target_df.columns)}. Source columns: {list(source_df.columns)}. "
                f"Check your 'keys' configuration matches actual column names."
            )

    target_df_indexed = target_df.set_index(keys)
    source_df_indexed = source_df.set_index(keys)

    if strategy == MergeStrategy.UPSERT:
        # Update existing
        # NOTE: We must ensure created_col is NOT updated if it already exists
        if audit_cols and audit_cols.created_col:
            created_col = audit_cols.created_col
            # Remove created_col from source update payload if present
            cols_to_update = [c for c in source_df_indexed.columns if c != created_col]
            target_df_indexed.update(source_df_indexed[cols_to_update])
        else:
            target_df_indexed.update(source_df_indexed)

        # Append new
        new_indices = source_df_indexed.index.difference(target_df_indexed.index)
        if not new_indices.empty:
            target_df_indexed = pd.concat([target_df_indexed, source_df_indexed.loc[new_indices]])

    elif strategy == MergeStrategy.APPEND_ONLY:
        # Only append new
        new_indices = source_df_indexed.index.difference(target_df_indexed.index)
        if not new_indices.empty:
            target_df_indexed = pd.concat([target_df_indexed, source_df_indexed.loc[new_indices]])

    elif strategy == MergeStrategy.DELETE_MATCH:
        # Drop indices present in source
        target_df_indexed = target_df_indexed.drop(source_df_indexed.index, errors="ignore")

    # Reset index
    final_df = target_df_indexed.reset_index()

    # Write back
    final_df.to_parquet(path, index=False)

    return source_df
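
For orientation, here is a minimal usage sketch of the `MergeParams` model defined in this file. It assumes odibi 2.5.0 is installed and that the import path follows the file listing above (`odibi/transformers/merge_transformer.py`); the engine/context wiring needed to actually execute the `merge` step is project-specific and not shown in this diff.

```python
# Minimal sketch: exercises only the MergeParams validation shown above.
from odibi.transformers.merge_transformer import MergeParams, MergeStrategy

# Valid: 'target' plus non-empty keys; the string strategy and the dict
# audit_cols are coerced by pydantic into MergeStrategy / AuditColumnsConfig.
# A '.parquet' target path is what routes the pandas engine through the
# DuckDB branch of _merge_pandas.
params = MergeParams(
    target="./data/silver/customers.parquet",
    keys=["customer_id"],
    strategy="upsert",
    audit_cols={"created_col": "dw_created_at", "updated_col": "dw_updated_at"},
)
print(params.strategy is MergeStrategy.UPSERT)  # True

# Invalid: 'target' and 'connection'+'path' are mutually exclusive, so the
# check_target_or_connection validator makes pydantic raise a ValidationError.
try:
    MergeParams(
        target="silver.customers",
        connection="goat_prod",
        path="OEE/silver/customers",
        keys=["customer_id"],
    )
except Exception as exc:
    print(type(exc).__name__)
```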