cloe-nessy 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +72 -8
- cloe_nessy/integration/writer/delta_writer/delta_writer_base.py +36 -0
- cloe_nessy/pipeline/actions/write_delta_merge.py +34 -23
- {cloe_nessy-1.0.1.dist-info → cloe_nessy-1.0.3.dist-info}/METADATA +1 -1
- {cloe_nessy-1.0.1.dist-info → cloe_nessy-1.0.3.dist-info}/RECORD +6 -6
- {cloe_nessy-1.0.1.dist-info → cloe_nessy-1.0.3.dist-info}/WHEEL +0 -0
cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
@@ -29,6 +29,9 @@ class DeltaMergeConfig(BaseModel):
         use_partition_pruning: Flag to specify whether to use partition
             pruning to optimize the performance of the merge operation.
         partition_by: List of column names to partition by.
+        column_mapping: Mapping from target column names to source column names.
+            If a column is not in the mapping, it's assumed to have the same name
+            in both source and target.
     """
 
     dataframe_columns: list[str]
@@ -39,6 +42,7 @@ class DeltaMergeConfig(BaseModel):
     when_not_matched_insert: bool = True
     use_partition_pruning: bool = True
     partition_by: list[str] = Field(default_factory=list)
+    column_mapping: dict[str, str] = Field(default_factory=dict)
     cols_to_merge: list[str] = Field(default_factory=list, alias="_cols_to_merge")
     cols_to_update: set[str] = Field(default_factory=set, alias="_cols_to_update")
     cols_to_insert: set[str] = Field(default_factory=set, alias="_cols_to_insert")
@@ -58,11 +62,20 @@ class DeltaMergeConfig(BaseModel):
     @model_validator(mode="before")
     @classmethod
     def _validate_key_columns(cls, config: Any):
-        """Key columns must exist in the data frame."""
+        """Key columns must exist in the data frame (considering column mapping)."""
         key_columns = config.get("key_columns")
         dataframe_columns = config.get("dataframe_columns")
-
-
+        column_mapping = config.get("column_mapping", {})
+
+        # For each key column (target name), find the corresponding source column
+        missing_columns = []
+        for key_col in key_columns:
+            source_col = column_mapping.get(key_col, key_col)
+            if source_col not in dataframe_columns:
+                missing_columns.append(f"{key_col} (maps to {source_col})" if key_col != source_col else key_col)
+
+        if missing_columns:
+            raise ValueError(f"Key columns must exist in the DataFrame. Missing columns: {', '.join(missing_columns)}")
         return config
 
     @model_validator(mode="before")
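The validator above resolves each target-side key column through `column_mapping` before checking it against the source DataFrame's columns. As a rough illustration only (this is not the package's `DeltaMergeConfig`; the column names are hypothetical), the core lookup behaves like this:

```python
# Minimal sketch of the mapping-aware key-column check shown in the hunk above.
# All column names here are made-up illustration data.

def find_missing_key_columns(
    key_columns: list[str],
    dataframe_columns: list[str],
    column_mapping: dict[str, str],
) -> list[str]:
    """Return key columns whose mapped source column is absent from the DataFrame."""
    missing = []
    for key_col in key_columns:
        source_col = column_mapping.get(key_col, key_col)  # fall back to the same name
        if source_col not in dataframe_columns:
            missing.append(f"{key_col} (maps to {source_col})" if key_col != source_col else key_col)
    return missing


# "customer_id" maps to the source column "cust_id", which exists, so only
# "order_id" is reported as missing.
print(find_missing_key_columns(
    key_columns=["customer_id", "order_id"],
    dataframe_columns=["cust_id", "name", "email"],
    column_mapping={"customer_id": "cust_id"},
))  # ['order_id']
```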
@@ -70,15 +83,37 @@ class DeltaMergeConfig(BaseModel):
     def _derive_merge_columns(cls, config: Any):
         """Derive update and insert columns from the DataFrame columns."""
         dataframe_columns = config.get("dataframe_columns", [])
-        config
+        column_mapping = config.get("column_mapping", {})
+
+        # Build reverse mapping: source_col -> target_col
+        reverse_mapping = {v: k for k, v in column_mapping.items()}
+
+        # Determine which target columns we're working with
+        # For each dataframe column, find its corresponding target column
+        target_columns = []
+        for df_col in dataframe_columns:
+            target_col = reverse_mapping.get(df_col, df_col)
+            target_columns.append(target_col)
+
+        config["_cols_to_merge"] = list(set(target_columns))
+
         if config.get("cols_to_exclude_from_update"):
             config["_cols_to_update"] = set(config["_cols_to_merge"]) - set(config["cols_to_exclude_from_update"])
         else:
             config["_cols_to_update"] = set(config["_cols_to_merge"])
 
         config["_cols_to_insert"] = config["_cols_to_merge"]
-
-
+
+        # Build final mappings using column_mapping (target -> source)
+        # For each target column, find the corresponding source column
+        config["final_cols_to_update"] = {
+            target_col: f"source.`{column_mapping.get(target_col, target_col)}`"
+            for target_col in config["_cols_to_update"]
+        }
+        config["final_cols_to_insert"] = {
+            target_col: f"source.`{column_mapping.get(target_col, target_col)}`"
+            for target_col in config["_cols_to_insert"]
+        }
         return config
 
     @model_validator(mode="after")
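To see what this derivation produces, here is a reduced, standalone sketch that mirrors the hunk's logic outside the validator; it is not the package's code, and the column names are hypothetical:

```python
# Source columns are translated back to target names via the reverse mapping,
# then each target column is paired with a backtick-quoted source expression.

dataframe_columns = ["cust_id", "name", "email"]                   # source-side names
column_mapping = {"customer_id": "cust_id", "full_name": "name"}   # target -> source

reverse_mapping = {v: k for k, v in column_mapping.items()}        # source -> target
target_columns = [reverse_mapping.get(c, c) for c in dataframe_columns]
# ['customer_id', 'full_name', 'email']

final_cols_to_update = {
    target_col: f"source.`{column_mapping.get(target_col, target_col)}`"
    for target_col in target_columns
}
print(final_cols_to_update)
# {'customer_id': 'source.`cust_id`', 'full_name': 'source.`name`', 'email': 'source.`email`'}
```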
@@ -127,7 +162,7 @@ class DeltaMergeWriter(BaseDeltaWriter):
 
     def _build_match_conditions(self, data_frame: DataFrame, config: DeltaMergeConfig) -> str:
         """Builds match conditions for the Delta table merge."""
-        match_conditions = self.
+        match_conditions = self._merge_match_conditions_with_mapping(config.key_columns, config.column_mapping)
         if config.use_partition_pruning:
             match_conditions_list = [match_conditions] + [
                 self._partition_pruning_conditions(data_frame, config.partition_by),
@@ -169,6 +204,11 @@ class DeltaMergeWriter(BaseDeltaWriter):
         function also supports partition pruning to optimize the performance of
         the merge operation.
 
+        When source and target tables have different column names, use the
+        `column_mapping` parameter to map target column names to source column names.
+        For any columns not in the mapping, the same name is assumed for both source
+        and target.
+
         Args:
             table: The Table object representing the Delta table.
             table_identifier: The identifier of the Delta table in the format
@@ -178,7 +218,16 @@ class DeltaMergeWriter(BaseDeltaWriter):
             ignore_empty_df: A flag indicating whether to ignore an empty source
                 dataframe.
             kwargs: Passed to the
-                [`DeltaMergeConfig`][cloe_nessy.integration.writer.delta_merge_writer.DeltaMergeConfig].
+                [`DeltaMergeConfig`][cloe_nessy.integration.writer.delta_writer.delta_merge_writer.DeltaMergeConfig].
+                Common kwargs include:
+                - key_columns: List of target column names to use as merge keys.
+                - column_mapping: Dict mapping target column names to source column names.
+                - when_matched_update: Whether to update matching records.
+                - when_matched_delete: Whether to delete matching records.
+                - when_not_matched_insert: Whether to insert non-matching records.
+                - cols_to_exclude_from_update: Target columns to exclude from updates.
+                - use_partition_pruning: Whether to use partition pruning.
+                - partition_by: List of partition columns.
 
         Raises:
             ValueError: If both, table and table_identifier or storage_path are provided.
@@ -189,6 +238,21 @@ class DeltaMergeWriter(BaseDeltaWriter):
                 merge operation.
             ValueError: If partition columns are not specified when using
                 partition pruning.
+
+        Example:
+            ```python
+            # Merge with different column names
+            writer.write(
+                data_frame=source_df,
+                table=target_table,
+                key_columns=["customer_id"],
+                column_mapping={
+                    "customer_id": "cust_id",
+                    "full_name": "name",
+                    "email_address": "email"
+                }
+            )
+            ```
         """
         if self._empty_dataframe_check(data_frame, ignore_empty_df):
             return

cloe_nessy/integration/writer/delta_writer/delta_writer_base.py
@@ -150,6 +150,42 @@ class BaseDeltaWriter(BaseWriter, ABC):
         """
         return " AND ".join([f"target.`{c}` <=> source.`{c}`" for c in columns])
 
+    @staticmethod
+    def _merge_match_conditions_with_mapping(
+        key_columns: list[str], column_mapping: dict[str, str] | None = None
+    ) -> str:
+        """Merges match conditions with support for column name mapping.
+
+        This function generates SQL match conditions for merging tables where source and target
+        columns may have different names.
+
+        Args:
+            key_columns: A list of target column names to use as keys for the merge operation.
+            column_mapping: A dictionary mapping target column names to source column names.
+                If None or empty, assumes source and target columns have the same names.
+
+        Returns:
+            A string containing the match conditions, separated by " AND "
+
+        Example:
+            ```python
+            # Without mapping (same column names):
+            _merge_match_conditions_with_mapping(["id", "customer_id"])
+            # "target.`id` <=> source.`id` AND target.`customer_id` <=> source.`customer_id`"
+
+            # With mapping (different column names):
+            _merge_match_conditions_with_mapping(
+                ["id", "customer_id"],
+                {"customer_id": "cust_id"}
+            )
+            # "target.`id` <=> source.`id` AND target.`customer_id` <=> source.`cust_id`"
+            ```
+        """
+        mapping = column_mapping or {}
+        return " AND ".join(
+            [f"target.`{target_col}` <=> source.`{mapping.get(target_col, target_col)}`" for target_col in key_columns]
+        )
+
     @staticmethod
     def _partition_pruning_conditions(df: "DataFrame", partition_cols: list[str] | None) -> str:
         """Generates partition pruning conditions for an SQL query.

cloe_nessy/pipeline/actions/write_delta_merge.py
@@ -13,6 +13,7 @@ class WriteDeltaMergeAction(PipelineAction):
 
     Example:
         ```yaml
+        # Basic merge with same column names
         Write Delta Merge:
             action: WRITE_DELTA_MERGE
             options:
@@ -20,13 +21,25 @@ class WriteDeltaMergeAction(PipelineAction):
                 key_columns:
                     - id
                     - customer_id
-
-                    -
-                    - email
-                    - updated_at
+                cols_to_exclude_from_update:
+                    - created_at
                 when_matched_update: true
                 when_not_matched_insert: true
                 use_partition_pruning: true
+
+        # Merge with different source and target column names
+        Write Delta Merge with Mapping:
+            action: WRITE_DELTA_MERGE
+            options:
+                table_identifier: my_catalog.my_schema.my_table
+                key_columns:
+                    - customer_id
+                column_mapping:
+                    customer_id: cust_id
+                    full_name: name
+                    email_address: email
+                when_matched_update: true
+                when_not_matched_insert: true
         ```
     """
 
@@ -38,11 +51,10 @@ class WriteDeltaMergeAction(PipelineAction):
         *,
         table_identifier: str | None = None,
         key_columns: list[str] | None = None,
-
-
-        cols_to_exclude: list[str] | None = None,
+        cols_to_exclude_from_update: list[str] | None = None,
+        column_mapping: dict[str, str] | None = None,
         when_matched_update: bool = True,
-
+        when_matched_delete: bool = False,
         when_not_matched_insert: bool = True,
         use_partition_pruning: bool = True,
         ignore_empty_df: bool = False,
@@ -57,23 +69,23 @@ class WriteDeltaMergeAction(PipelineAction):
             table_identifier: The identifier of the table. If passed, the
                 UC Adapter will be used to create a table object. Otherwise the Table
                 object will be created from the table metadata in the context.
-            key_columns: List of column names that form the
+            key_columns: List of target column names that form the
                 key for the merge operation.
+            cols_to_exclude_from_update: List of target column names to be
+                excluded from the update operation in the target Delta table.
+            column_mapping: Mapping from target column names to source column names.
+                Use this when source and target tables have different column names.
+                If a column is not in the mapping, it's assumed to have the same name
+                in both source and target.
             when_matched_update: Flag to specify whether to
-                perform an update operation
+                perform an update operation when matching records are found in
                 the target Delta table.
-
+            when_matched_delete: Flag to specify whether to
                 perform a delete operation when matching records are found in
                 the target Delta table.
             when_not_matched_insert: Flag to specify whether to perform an
                 insert operation when matching records are not found in the target
                 Delta table.
-            cols_to_update: List of column names to be
-                updated in the target Delta table.
-            cols_to_insert: List of column names to be
-                inserted into the target Delta table.
-            cols_to_exclude: List of column names to be
-                excluded from the merge operation.
             use_partition_pruning: Flag to specify whether to use partition
                 pruning to optimize the performance of the merge operation.
             ignore_empty_df: A flag indicating whether to ignore an empty source dataframe.
@@ -113,16 +125,15 @@ class WriteDeltaMergeAction(PipelineAction):
         assert key_columns is not None, "Key columns must be provided."
 
         delta_merge_writer.write(
-
+            data_frame=context.data,
             table=context.table_metadata,
+            table_identifier=context.table_metadata.identifier,
             storage_path=str(context.table_metadata.storage_path),
-            data_frame=context.data,
             key_columns=key_columns,
-
-
-            cols_to_exclude=cols_to_exclude,
+            cols_to_exclude_from_update=cols_to_exclude_from_update or [],
+            column_mapping=column_mapping or {},
             when_matched_update=when_matched_update,
-
+            when_matched_delete=when_matched_delete,
             when_not_matched_insert=when_not_matched_insert,
             use_partition_pruning=use_partition_pruning,
             partition_by=context.table_metadata.partition_by,

{cloe_nessy-1.0.1.dist-info → cloe_nessy-1.0.3.dist-info}/RECORD
@@ -40,9 +40,9 @@ cloe_nessy/integration/writer/file_writer.py,sha256=SUDbN13ZzDhbM8DpOGFgM_Gkg70T
 cloe_nessy/integration/writer/writer.py,sha256=elFPLFrWR-qVE9qnBtzzzhyRALLQcRVuOsPS0rNmRt4,1741
 cloe_nessy/integration/writer/delta_writer/__init__.py,sha256=h2CT6Hllmk0nodlek27uqwniCzVZKMkYcPGyG9K2Z24,164
 cloe_nessy/integration/writer/delta_writer/delta_append_writer.py,sha256=nribgHmapp59v3Rw_AfJg0_BRYhP7x2IJIeE74Ia_6A,4748
-cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=
+cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=aMpWa8GcnW9xu5eGE_AsVyfkL5hRIeJwfCLPniM8lak,13170
 cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py,sha256=m4YFY9_WgaOcnpBviVt3Km-w3wf3NF25wPS-n0NBGcE,970
-cloe_nessy/integration/writer/delta_writer/delta_writer_base.py,sha256=
+cloe_nessy/integration/writer/delta_writer/delta_writer_base.py,sha256=B7PwPHKrsJL0ZxBT-H9wWSy0gn7shqNDJ0AbrpMHyMg,10135
 cloe_nessy/integration/writer/delta_writer/exceptions.py,sha256=xPmGiYV0xQXauln5Oh34E5vbm0rVcs6xCh-SJSb2bw0,107
 cloe_nessy/logging/__init__.py,sha256=ySVCVbdyR3Dno_tl2ZfiER_7EVaDoQMHVkNyfdMZumY,65
 cloe_nessy/logging/logger_mixin.py,sha256=H8MyMEyb_kEDP0Ow5QStAFLuOkTIeUnneGaj916fKlU,7443
@@ -101,7 +101,7 @@ cloe_nessy/pipeline/actions/transform_union.py,sha256=SZtEzh567CIExUj9yMEgshE28h
 cloe_nessy/pipeline/actions/transform_with_column.py,sha256=c-E1yYkeYmovbN1maT7ImpdQlW0nYvYsHCtDvfe4wt8,3357
 cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=FyC0scQU8Ul3Uigpk6IN2IJpf_4jRjAqF5yHtDVwG00,4852
 cloe_nessy/pipeline/actions/write_delta_append.py,sha256=e1g4mDhwAZdKyt4Gb7ZzHcQrJ1duSl8qOn6ONizRsoM,2934
-cloe_nessy/pipeline/actions/write_delta_merge.py,sha256=
+cloe_nessy/pipeline/actions/write_delta_merge.py,sha256=kZL2PTIwB6Mj4UKg5f9SvU1VaakuYfFoymlcLf-L7dA,6443
 cloe_nessy/pipeline/actions/write_file.py,sha256=JZ8UZslxUn_ttYt5wDyvtHFq2FqYk3vOR8kvExJI8pk,3212
 cloe_nessy/pipeline/utils/__init__.py,sha256=xi02UjBMiXWD7b9gDvww4gyRyowb0eRd_6Wbu0F_cro,118
 cloe_nessy/pipeline/utils/delta_load_utils.py,sha256=KitMNruxePEkecI0h4Jint1JwJpaEog5mCOchMkgan8,1495
@@ -113,6 +113,6 @@ cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_Up
 cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/utils/column_names.py,sha256=dCNtm61mc5aLkY2oE4rlfN3VLCrpot6fOESjAZmCmhA,361
 cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
-cloe_nessy-1.0.
-cloe_nessy-1.0.
-cloe_nessy-1.0.
+cloe_nessy-1.0.3.dist-info/METADATA,sha256=fqBGuiBnOft_b6Q3yS_hxFPi5pqduBX7V7bBeXYwkvQ,3291
+cloe_nessy-1.0.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+cloe_nessy-1.0.3.dist-info/RECORD,,

{cloe_nessy-1.0.1.dist-info → cloe_nessy-1.0.3.dist-info}/WHEEL: File without changes