cloe-nessy 1.0.0__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/integration/reader/excel_reader.py +1 -1
- cloe_nessy/integration/reader/file_reader.py +2 -1
- cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +72 -8
- cloe_nessy/integration/writer/delta_writer/delta_writer_base.py +36 -0
- cloe_nessy/pipeline/actions/__init__.py +2 -0
- cloe_nessy/pipeline/actions/transform_regex_extract.py +169 -0
- cloe_nessy/pipeline/actions/write_delta_merge.py +34 -23
- {cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.3.dist-info}/METADATA +2 -2
- {cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.3.dist-info}/RECORD +10 -9
- {cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.3.dist-info}/WHEEL +0 -0
cloe_nessy/integration/reader/file_reader.py

@@ -192,7 +192,8 @@ class FileReader(BaseReader):
         """Add all metadata columns to the DataFrame."""
         metadata_columns = df.select("_metadata.*").columns
 
-
+        # Cast all metadata values to strings to ensure type consistency in the map
+        entries = [(F.lit(field), F.col(f"_metadata.{field}").cast("string")) for field in metadata_columns]
         flat_list = [item for tup in entries for item in tup]
 
         df = df.withColumn("__metadata", F.create_map(flat_list))
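The new lines interleave field names and string-cast values into one flat list, which `F.create_map` consumes as alternating key/value entries. A minimal plain-Python sketch of that interleaving, with made-up field names standing in for the Spark `Column` objects used in the real code:

```python
# Sketch of the key/value interleaving that feeds F.create_map.
# In file_reader.py the pairs are (F.lit(field), F.col(...).cast("string"));
# plain strings stand in for those Column objects here.
metadata_columns = ["file_path", "file_modification_time"]  # hypothetical field names
entries = [(field, f"<string value of {field}>") for field in metadata_columns]
flat_list = [item for tup in entries for item in tup]
print(flat_list)
# ['file_path', '<string value of file_path>',
#  'file_modification_time', '<string value of file_modification_time>']
# F.create_map(flat_list) then builds a map column {field -> value}; casting every
# value to string keeps the map's value type consistent across fields.
```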
cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py

@@ -29,6 +29,9 @@ class DeltaMergeConfig(BaseModel):
         use_partition_pruning: Flag to specify whether to use partition
             pruning to optimize the performance of the merge operation.
         partition_by: List of column names to partition by.
+        column_mapping: Mapping from target column names to source column names.
+            If a column is not in the mapping, it's assumed to have the same name
+            in both source and target.
     """
 
     dataframe_columns: list[str]
@@ -39,6 +42,7 @@ class DeltaMergeConfig(BaseModel):
     when_not_matched_insert: bool = True
     use_partition_pruning: bool = True
     partition_by: list[str] = Field(default_factory=list)
+    column_mapping: dict[str, str] = Field(default_factory=dict)
     cols_to_merge: list[str] = Field(default_factory=list, alias="_cols_to_merge")
     cols_to_update: set[str] = Field(default_factory=set, alias="_cols_to_update")
     cols_to_insert: set[str] = Field(default_factory=set, alias="_cols_to_insert")
@@ -58,11 +62,20 @@ class DeltaMergeConfig(BaseModel):
     @model_validator(mode="before")
     @classmethod
     def _validate_key_columns(cls, config: Any):
-        """Key columns must exist in the data frame."""
+        """Key columns must exist in the data frame (considering column mapping)."""
         key_columns = config.get("key_columns")
         dataframe_columns = config.get("dataframe_columns")
-
-
+        column_mapping = config.get("column_mapping", {})
+
+        # For each key column (target name), find the corresponding source column
+        missing_columns = []
+        for key_col in key_columns:
+            source_col = column_mapping.get(key_col, key_col)
+            if source_col not in dataframe_columns:
+                missing_columns.append(f"{key_col} (maps to {source_col})" if key_col != source_col else key_col)
+
+        if missing_columns:
+            raise ValueError(f"Key columns must exist in the DataFrame. Missing columns: {', '.join(missing_columns)}")
         return config
 
     @model_validator(mode="before")
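The validator now resolves each target-side key column through `column_mapping` before checking that it exists among the source DataFrame columns. A standalone sketch of that resolution step, with illustrative column names:

```python
# Standalone sketch of the key-column check with a target -> source mapping.
def missing_key_columns(key_columns, dataframe_columns, column_mapping=None):
    mapping = column_mapping or {}
    missing = []
    for key_col in key_columns:
        source_col = mapping.get(key_col, key_col)  # fall back to the same name
        if source_col not in dataframe_columns:
            missing.append(f"{key_col} (maps to {source_col})" if key_col != source_col else key_col)
    return missing

# "customer_id" resolves to "cust_id", which the source DataFrame provides;
# "order_id" has no mapping and is genuinely absent, so it is reported.
print(missing_key_columns(
    ["customer_id", "order_id"],
    ["cust_id", "name"],
    {"customer_id": "cust_id"},
))  # -> ['order_id']
```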
@@ -70,15 +83,37 @@ class DeltaMergeConfig(BaseModel):
     def _derive_merge_columns(cls, config: Any):
         """Derive update and insert columns from the DataFrame columns."""
         dataframe_columns = config.get("dataframe_columns", [])
-        config
+        column_mapping = config.get("column_mapping", {})
+
+        # Build reverse mapping: source_col -> target_col
+        reverse_mapping = {v: k for k, v in column_mapping.items()}
+
+        # Determine which target columns we're working with
+        # For each dataframe column, find its corresponding target column
+        target_columns = []
+        for df_col in dataframe_columns:
+            target_col = reverse_mapping.get(df_col, df_col)
+            target_columns.append(target_col)
+
+        config["_cols_to_merge"] = list(set(target_columns))
+
         if config.get("cols_to_exclude_from_update"):
             config["_cols_to_update"] = set(config["_cols_to_merge"]) - set(config["cols_to_exclude_from_update"])
         else:
             config["_cols_to_update"] = set(config["_cols_to_merge"])
 
         config["_cols_to_insert"] = config["_cols_to_merge"]
-
-
+
+        # Build final mappings using column_mapping (target -> source)
+        # For each target column, find the corresponding source column
+        config["final_cols_to_update"] = {
+            target_col: f"source.`{column_mapping.get(target_col, target_col)}`"
+            for target_col in config["_cols_to_update"]
+        }
+        config["final_cols_to_insert"] = {
+            target_col: f"source.`{column_mapping.get(target_col, target_col)}`"
+            for target_col in config["_cols_to_insert"]
+        }
         return config
 
     @model_validator(mode="after")
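The derivation runs the source column names through the reverse mapping to obtain target names, then builds target-to-`source.` expressions for the update and insert clauses. A plain-Python sketch of that two-step translation, with illustrative names:

```python
# Sketch: translate source column names to target names, then build the
# target -> "source.`<source column>`" assignment expressions used by the merge.
dataframe_columns = ["cust_id", "name", "email"]                   # source-side names (illustrative)
column_mapping = {"customer_id": "cust_id", "full_name": "name"}   # target -> source

reverse_mapping = {v: k for k, v in column_mapping.items()}        # source -> target
target_columns = [reverse_mapping.get(c, c) for c in dataframe_columns]
# -> ['customer_id', 'full_name', 'email']

assignments = {
    target_col: f"source.`{column_mapping.get(target_col, target_col)}`"
    for target_col in target_columns
}
print(assignments)
# {'customer_id': 'source.`cust_id`', 'full_name': 'source.`name`', 'email': 'source.`email`'}
```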
@@ -127,7 +162,7 @@ class DeltaMergeWriter(BaseDeltaWriter):
 
     def _build_match_conditions(self, data_frame: DataFrame, config: DeltaMergeConfig) -> str:
         """Builds match conditions for the Delta table merge."""
-        match_conditions = self.
+        match_conditions = self._merge_match_conditions_with_mapping(config.key_columns, config.column_mapping)
         if config.use_partition_pruning:
             match_conditions_list = [match_conditions] + [
                 self._partition_pruning_conditions(data_frame, config.partition_by),
@@ -169,6 +204,11 @@ class DeltaMergeWriter(BaseDeltaWriter):
         function also supports partition pruning to optimize the performance of
         the merge operation.
 
+        When source and target tables have different column names, use the
+        `column_mapping` parameter to map target column names to source column names.
+        For any columns not in the mapping, the same name is assumed for both source
+        and target.
+
         Args:
             table: The Table object representing the Delta table.
             table_identifier: The identifier of the Delta table in the format
@@ -178,7 +218,16 @@ class DeltaMergeWriter(BaseDeltaWriter):
             ignore_empty_df: A flag indicating whether to ignore an empty source
                 dataframe.
             kwargs: Passed to the
-                [`DeltaMergeConfig`][cloe_nessy.integration.writer.delta_merge_writer.DeltaMergeConfig].
+                [`DeltaMergeConfig`][cloe_nessy.integration.writer.delta_writer.delta_merge_writer.DeltaMergeConfig].
+                Common kwargs include:
+                - key_columns: List of target column names to use as merge keys.
+                - column_mapping: Dict mapping target column names to source column names.
+                - when_matched_update: Whether to update matching records.
+                - when_matched_delete: Whether to delete matching records.
+                - when_not_matched_insert: Whether to insert non-matching records.
+                - cols_to_exclude_from_update: Target columns to exclude from updates.
+                - use_partition_pruning: Whether to use partition pruning.
+                - partition_by: List of partition columns.
 
         Raises:
             ValueError: If both, table and table_identifier or storage_path are provided.
@@ -189,6 +238,21 @@ class DeltaMergeWriter(BaseDeltaWriter):
                 merge operation.
             ValueError: If partition columns are not specified when using
                 partition pruning.
+
+        Example:
+            ```python
+            # Merge with different column names
+            writer.write(
+                data_frame=source_df,
+                table=target_table,
+                key_columns=["customer_id"],
+                column_mapping={
+                    "customer_id": "cust_id",
+                    "full_name": "name",
+                    "email_address": "email"
+                }
+            )
+            ```
         """
         if self._empty_dataframe_check(data_frame, ignore_empty_df):
             return
cloe_nessy/integration/writer/delta_writer/delta_writer_base.py

@@ -150,6 +150,42 @@ class BaseDeltaWriter(BaseWriter, ABC):
         """
         return " AND ".join([f"target.`{c}` <=> source.`{c}`" for c in columns])
 
+    @staticmethod
+    def _merge_match_conditions_with_mapping(
+        key_columns: list[str], column_mapping: dict[str, str] | None = None
+    ) -> str:
+        """Merges match conditions with support for column name mapping.
+
+        This function generates SQL match conditions for merging tables where source and target
+        columns may have different names.
+
+        Args:
+            key_columns: A list of target column names to use as keys for the merge operation.
+            column_mapping: A dictionary mapping target column names to source column names.
+                If None or empty, assumes source and target columns have the same names.
+
+        Returns:
+            A string containing the match conditions, separated by " AND "
+
+        Example:
+            ```python
+            # Without mapping (same column names):
+            _merge_match_conditions_with_mapping(["id", "customer_id"])
+            # "target.`id` <=> source.`id` AND target.`customer_id` <=> source.`customer_id`"
+
+            # With mapping (different column names):
+            _merge_match_conditions_with_mapping(
+                ["id", "customer_id"],
+                {"customer_id": "cust_id"}
+            )
+            # "target.`id` <=> source.`id` AND target.`customer_id` <=> source.`cust_id`"
+            ```
+        """
+        mapping = column_mapping or {}
+        return " AND ".join(
+            [f"target.`{target_col}` <=> source.`{mapping.get(target_col, target_col)}`" for target_col in key_columns]
+        )
+
     @staticmethod
     def _partition_pruning_conditions(df: "DataFrame", partition_cols: list[str] | None) -> str:
         """Generates partition pruning conditions for an SQL query.
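The new helper is pure string building, so its behaviour is easy to reproduce outside Spark. A standalone sketch mirroring the added static method as a free function, with the surrounding class omitted:

```python
# Mirror of the match-condition builder added to BaseDeltaWriter, as a free function.
def merge_match_conditions_with_mapping(key_columns, column_mapping=None):
    mapping = column_mapping or {}
    return " AND ".join(
        f"target.`{target_col}` <=> source.`{mapping.get(target_col, target_col)}`"
        for target_col in key_columns
    )

print(merge_match_conditions_with_mapping(["id", "customer_id"]))
# target.`id` <=> source.`id` AND target.`customer_id` <=> source.`customer_id`
print(merge_match_conditions_with_mapping(["id", "customer_id"], {"customer_id": "cust_id"}))
# target.`id` <=> source.`id` AND target.`customer_id` <=> source.`cust_id`
```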
cloe_nessy/pipeline/actions/__init__.py

@@ -19,6 +19,7 @@ from .transform_group_aggregate import TransformGroupAggregate
 from .transform_hash_columns import TransformHashColumnsAction
 from .transform_join import TransformJoinAction
 from .transform_json_normalize import TransformJsonNormalize
+from .transform_regex_extract import TransformRegexExtract
 from .transform_rename_columns import TransformRenameColumnsAction
 from .transform_replace_values import TransformReplaceValuesAction
 from .transform_select_columns import TransformSelectColumnsAction
@@ -56,6 +57,7 @@ __all__ = [
     "TransformGroupAggregate",
     "TransformJoinAction",
     "TransformJsonNormalize",
+    "TransformRegexExtract",
     "TransformRenameColumnsAction",
     "TransformReplaceValuesAction",
     "TransformSelectColumnsAction",
cloe_nessy/pipeline/actions/transform_regex_extract.py (new file)

@@ -0,0 +1,169 @@
+import re
+from typing import Any
+
+import pyspark.sql.functions as F
+
+from cloe_nessy.pipeline.pipeline_action import PipelineAction
+from cloe_nessy.pipeline.pipeline_context import PipelineContext
+
+
+class TransformRegexExtract(PipelineAction):
+    r"""Extract values from a specified column in a DataFrame using regex patterns.
+
+    This action extracts values from a column based on a regex pattern and stores
+    the result in a new column. Optionally, you can replace the matched pattern in
+    the original column with a different string, remove the original column, or add
+    a boolean column indicating which rows matched the pattern.
+
+    Example:
+        ```yaml
+        Extract Action:
+            action: TRANSFORM_REGEX_EXTRACT
+            options:
+                source_column_name: Email
+                extract_column_name: org_domain
+                pattern: (?<=@)([A-Za-z0-9-]+)
+                replace_by: exampledomain.org
+        ```
+
+    This action also supports processing multiple columns simultaneously. To use this
+    functionality, structure the configuration as a dictionary mapping each source
+    column name to its extraction parameters.
+
+    Example:
+        ```yaml
+        Extract Action:
+            action: TRANSFORM_REGEX_EXTRACT
+            options:
+                extract_columns:
+                    Name:
+                        pattern: (?<=\w+) (\w+)
+                        replace_by: ''
+                        extract_column_name: last_name
+                        match_info_column_name: has_last_name
+                    Email:
+                        pattern: @\w+\.\w+
+                        extract_column_name: domain
+                        keep_original_column: False
+        ```
+
+    """
+
+    name: str = "TRANSFORM_REGEX_EXTRACT"
+
+    def run(
+        self,
+        context: PipelineContext,
+        source_column_name: str = "",
+        extract_column_name: str = "",
+        pattern: str = "",
+        keep_original_column: bool = True,
+        replace_by: str = "",
+        match_info_column_name: str = "",
+        extract_columns: dict | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Performs a regex extract (and replace) on a specified column in a DataFrame.
+
+        This function performs a regex extract (and optionally a replace) on one or more columns.
+
+        Args:
+            context: The context in which this action is executed.
+            source_column_name: Column name to perform the regex replace on.
+            pattern: Regex pattern to match.
+            replace_by: String that should replace the extracted pattern in the source column.
+            extract_column_name: Column name to store the extract, default: <source_column_name>_extract
+            keep_original_column: Whether to keep the original column, default: True
+            match_info_column_name: Column name to store a boolean column whether a match was found, default: None
+            extract_columns: Dictionary of column names and their corresponding 1-column-case.
+
+        Raises:
+            ValueError: If any of the required arguments are not provided.
+            ValueError: If the regex pattern is invalid.
+
+        Returns:
+            PipelineContext: Transformed context with the modified DataFrame.
+        """
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+        if not extract_columns and not source_column_name:
+            raise ValueError("Either extract_columns or source_column_name must be provided.")
+
+        df = context.data
+
+        if source_column_name:
+            self._console_logger.info(f"Extracting from column '{source_column_name}' using pattern: {pattern}")
+            df = self._process_one_column(
+                df,
+                source_column_name,
+                pattern,
+                extract_column_name,
+                replace_by,
+                keep_original_column,
+                match_info_column_name,
+            )
+
+        elif isinstance(extract_columns, dict):
+            self._console_logger.info(f"Extracting from {len(extract_columns)} columns")
+            for one_source_column_name in extract_columns:
+                parameter_dict = self._get_default_dict() | extract_columns[one_source_column_name]
+                df = self._process_one_column(df, one_source_column_name, **parameter_dict)
+
+        else:
+            raise ValueError("extract_columns must be a dictionary. See documentation for proper format.")
+
+        return context.from_existing(data=df)
+
+    def _process_one_column(
+        self,
+        df,
+        source_column_name,
+        pattern,
+        extract_column_name,
+        replace_by,
+        keep_original_column,
+        match_info_column_name,
+    ):
+        # Extract the first captured group (group 0 is the entire match)
+        matched_group_id = 0
+
+        if not extract_column_name:
+            extract_column_name = f"{source_column_name}_extracted"
+
+        if not pattern:
+            raise ValueError(f"The regex pattern (pattern) for column {source_column_name} must be provided.")
+
+        # Validate regex pattern
+        try:
+            re.compile(pattern)
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern '{pattern}' for column {source_column_name}: {e}") from e
+
+        df = df.withColumn(extract_column_name, F.regexp_extract(source_column_name, pattern, matched_group_id))
+
+        if replace_by:
+            df = df.withColumn(source_column_name, F.regexp_replace(source_column_name, pattern, replace_by))
+
+        if match_info_column_name:
+            # Check if extraction is null or empty string
+            df = df.withColumn(
+                match_info_column_name,
+                F.when((F.col(extract_column_name).isNull()) | (F.col(extract_column_name) == ""), False).otherwise(
+                    True
+                ),
+            )
+
+        if not keep_original_column:
+            df = df.drop(source_column_name)
+
+        return df
+
+    def _get_default_dict(self) -> dict[str, Any]:
+        """Return default parameters for single column extraction."""
+        return {
+            "pattern": "",
+            "extract_column_name": "",
+            "replace_by": "",
+            "keep_original_column": True,
+            "match_info_column_name": "",
+        }
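Outside the pipeline, the core of `_process_one_column` reduces to three PySpark column operations. A hedged sketch of that sequence on a throwaway DataFrame (requires a local `pyspark` install; the data, column names, and pattern are illustrative, not taken from the package):

```python
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("regex-extract-sketch").getOrCreate()
df = spark.createDataFrame([("alice@example.org",), ("no-at-sign",)], ["Email"])

pattern = r"(?<=@)([A-Za-z0-9-]+)"
# 1. Extract the full match (group 0) into a new column.
df = df.withColumn("org_domain", F.regexp_extract("Email", pattern, 0))
# 2. Optionally replace the matched text in the source column.
df = df.withColumn("Email", F.regexp_replace("Email", pattern, "exampledomain"))
# 3. Optionally flag rows where the extraction produced a non-empty result.
df = df.withColumn(
    "has_domain",
    F.when(F.col("org_domain").isNull() | (F.col("org_domain") == ""), False).otherwise(True),
)
df.show(truncate=False)
```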
cloe_nessy/pipeline/actions/write_delta_merge.py

@@ -13,6 +13,7 @@ class WriteDeltaMergeAction(PipelineAction):
 
     Example:
         ```yaml
+        # Basic merge with same column names
        Write Delta Merge:
            action: WRITE_DELTA_MERGE
            options:
@@ -20,13 +21,25 @@ class WriteDeltaMergeAction(PipelineAction):
                key_columns:
                    - id
                    - customer_id
-
-                   -
-                   - email
-                   - updated_at
+               cols_to_exclude_from_update:
+                   - created_at
                when_matched_update: true
                when_not_matched_insert: true
                use_partition_pruning: true
+
+       # Merge with different source and target column names
+       Write Delta Merge with Mapping:
+           action: WRITE_DELTA_MERGE
+           options:
+               table_identifier: my_catalog.my_schema.my_table
+               key_columns:
+                   - customer_id
+               column_mapping:
+                   customer_id: cust_id
+                   full_name: name
+                   email_address: email
+               when_matched_update: true
+               when_not_matched_insert: true
        ```
    """
 
@@ -38,11 +51,10 @@ class WriteDeltaMergeAction(PipelineAction):
         *,
         table_identifier: str | None = None,
         key_columns: list[str] | None = None,
-
-
-        cols_to_exclude: list[str] | None = None,
+        cols_to_exclude_from_update: list[str] | None = None,
+        column_mapping: dict[str, str] | None = None,
         when_matched_update: bool = True,
-
+        when_matched_delete: bool = False,
         when_not_matched_insert: bool = True,
         use_partition_pruning: bool = True,
         ignore_empty_df: bool = False,
@@ -57,23 +69,23 @@ class WriteDeltaMergeAction(PipelineAction):
             table_identifier: The identifier of the table. If passed, the
                 UC Adapter will be used to create a table object. Otherwise the Table
                 object will be created from the table metadata in the context.
-            key_columns: List of column names that form the
+            key_columns: List of target column names that form the
                 key for the merge operation.
+            cols_to_exclude_from_update: List of target column names to be
+                excluded from the update operation in the target Delta table.
+            column_mapping: Mapping from target column names to source column names.
+                Use this when source and target tables have different column names.
+                If a column is not in the mapping, it's assumed to have the same name
+                in both source and target.
             when_matched_update: Flag to specify whether to
-                perform an update operation
+                perform an update operation when matching records are found in
                 the target Delta table.
-
+            when_matched_delete: Flag to specify whether to
                 perform a delete operation when matching records are found in
                 the target Delta table.
             when_not_matched_insert: Flag to specify whether to perform an
                 insert operation when matching records are not found in the target
                 Delta table.
-            cols_to_update: List of column names to be
-                updated in the target Delta table.
-            cols_to_insert: List of column names to be
-                inserted into the target Delta table.
-            cols_to_exclude: List of column names to be
-                excluded from the merge operation.
             use_partition_pruning: Flag to specify whether to use partition
                 pruning to optimize the performance of the merge operation.
             ignore_empty_df: A flag indicating whether to ignore an empty source dataframe.
@@ -113,16 +125,15 @@ class WriteDeltaMergeAction(PipelineAction):
         assert key_columns is not None, "Key columns must be provided."
 
         delta_merge_writer.write(
-
+            data_frame=context.data,
             table=context.table_metadata,
+            table_identifier=context.table_metadata.identifier,
             storage_path=str(context.table_metadata.storage_path),
-            data_frame=context.data,
             key_columns=key_columns,
-
-
-            cols_to_exclude=cols_to_exclude,
+            cols_to_exclude_from_update=cols_to_exclude_from_update or [],
+            column_mapping=column_mapping or {},
             when_matched_update=when_matched_update,
-
+            when_matched_delete=when_matched_delete,
             when_not_matched_insert=when_not_matched_insert,
             use_partition_pruning=use_partition_pruning,
             partition_by=context.table_metadata.partition_by,
{cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.3.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cloe-nessy
-Version: 1.0.0
+Version: 1.0.3
 Summary: Your friendly datalake monster.
 Project-URL: homepage, https://initions.com/
 Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
@@ -12,7 +12,7 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Database
-Requires-Python: <3.
+Requires-Python: <3.14,>=3.11
 Requires-Dist: azure-identity<2.0.0,>=1.19.0
 Requires-Dist: cloe-logging[databricks,log-analytics]<0.4,>=0.3.8
 Requires-Dist: databricks-sdk<1.0.0,>=0.36.0
{cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.3.dist-info}/RECORD

@@ -30,9 +30,9 @@ cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py,sha256=
 cloe_nessy/integration/reader/__init__.py,sha256=NWQx-v6aKE8YOHhsxfeaZnMVq4KLKyRWXzUduf5aVsk,265
 cloe_nessy/integration/reader/api_reader.py,sha256=FbOyfLVG1ryL2GC-MgE1uClHICsQKBj9yZbY4TG5qrk,19637
 cloe_nessy/integration/reader/catalog_reader.py,sha256=DlnykmFjV_v8SCBh3qaCvf24QM-6TdMFVHx5Mqv7Nvs,4850
-cloe_nessy/integration/reader/excel_reader.py,sha256=
+cloe_nessy/integration/reader/excel_reader.py,sha256=QXm0MaE_-tW5ix-f_3Pgn-Vx7VG5jA_uSp858rVV7lA,8042
 cloe_nessy/integration/reader/exceptions.py,sha256=_A9jFpe_RIDZCGY76qzjic9bsshxns6yXPSl141dq1c,203
-cloe_nessy/integration/reader/file_reader.py,sha256=
+cloe_nessy/integration/reader/file_reader.py,sha256=FFqqu1h003FY2Df3ru-G1JO4Bg2Ai8Rzh58fjOCN7NM,8262
 cloe_nessy/integration/reader/reader.py,sha256=YHriYkzsBduBjfI2FnP03VEo15a8UCRZ_sXtre8eaEs,1041
 cloe_nessy/integration/writer/__init__.py,sha256=3yzCAGiWZdQWtsbzlTih01sxVTJV2DDYwvl34lEAUlE,243
 cloe_nessy/integration/writer/catalog_writer.py,sha256=dQeXmtfs7J6rP6Ye3OCvxBraFScFX_3SHs7Md58hEeM,5296
@@ -40,9 +40,9 @@ cloe_nessy/integration/writer/file_writer.py,sha256=SUDbN13ZzDhbM8DpOGFgM_Gkg70T
 cloe_nessy/integration/writer/writer.py,sha256=elFPLFrWR-qVE9qnBtzzzhyRALLQcRVuOsPS0rNmRt4,1741
 cloe_nessy/integration/writer/delta_writer/__init__.py,sha256=h2CT6Hllmk0nodlek27uqwniCzVZKMkYcPGyG9K2Z24,164
 cloe_nessy/integration/writer/delta_writer/delta_append_writer.py,sha256=nribgHmapp59v3Rw_AfJg0_BRYhP7x2IJIeE74Ia_6A,4748
-cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=
+cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=aMpWa8GcnW9xu5eGE_AsVyfkL5hRIeJwfCLPniM8lak,13170
 cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py,sha256=m4YFY9_WgaOcnpBviVt3Km-w3wf3NF25wPS-n0NBGcE,970
-cloe_nessy/integration/writer/delta_writer/delta_writer_base.py,sha256=
+cloe_nessy/integration/writer/delta_writer/delta_writer_base.py,sha256=B7PwPHKrsJL0ZxBT-H9wWSy0gn7shqNDJ0AbrpMHyMg,10135
 cloe_nessy/integration/writer/delta_writer/exceptions.py,sha256=xPmGiYV0xQXauln5Oh34E5vbm0rVcs6xCh-SJSb2bw0,107
 cloe_nessy/logging/__init__.py,sha256=ySVCVbdyR3Dno_tl2ZfiER_7EVaDoQMHVkNyfdMZumY,65
 cloe_nessy/logging/logger_mixin.py,sha256=H8MyMEyb_kEDP0Ow5QStAFLuOkTIeUnneGaj916fKlU,7443
@@ -74,7 +74,7 @@ cloe_nessy/pipeline/pipeline_context.py,sha256=eCOcjyE16rGRom3L85Gy_BbncfQD6i1x3
 cloe_nessy/pipeline/pipeline_parsing_service.py,sha256=eeC4RbGBILGN6zkbUyjH-qGgEMtOWV4Kv_VxrHbHMY0,9021
 cloe_nessy/pipeline/pipeline_plotting_service.py,sha256=goMQj73FzUVchKn5c2SsPcWR6fr7DtVkVrcQfJsKCq4,13111
 cloe_nessy/pipeline/pipeline_step.py,sha256=oTnlvRpB0fbOBQXbPe1URstA5fv-97igCHt_41fKCAk,2082
-cloe_nessy/pipeline/actions/__init__.py,sha256=
+cloe_nessy/pipeline/actions/__init__.py,sha256=FfAnSIl-0T6pnaWhClkDqV8nfTdvLvZZJdwycsZMLPw,2990
 cloe_nessy/pipeline/actions/read_api.py,sha256=MAc7QfmhnaRUMdE09Ywt41RSAsuW4co8zF0zXHwbM8U,16193
 cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=sx3dezd33c1FawMrxORwhK5GNo1IpjCyuLATWz7esZ0,6735
 cloe_nessy/pipeline/actions/read_excel.py,sha256=IG_VmDEt1TvGVEO0SY9Fm3awHNjfisR1_7DUmhC3NEE,7968
@@ -93,6 +93,7 @@ cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=KUHeeP-RIDi34dpb
 cloe_nessy/pipeline/actions/transform_hash_columns.py,sha256=M5_wolJwzJpPTSrZq4yWV3TH7H6BGqbjJkJCwtqPlQo,8507
 cloe_nessy/pipeline/actions/transform_join.py,sha256=ez1M1wVc9khOZj1swMArJbBKXxEpjenUHrW1wL8H330,7200
 cloe_nessy/pipeline/actions/transform_json_normalize.py,sha256=petF7pnNq1EKc8MqVdG0weFALAHNILSe_eAu4Z5XxIo,4833
+cloe_nessy/pipeline/actions/transform_regex_extract.py,sha256=vMtUW0s_oXy8DC1-4Xh-WQN3CCp8jXYsJiFYvGdYrqE,6390
 cloe_nessy/pipeline/actions/transform_rename_columns.py,sha256=4zJcPCONMU4C67qeuzsrX3AORRRHoq_selUI7FJyeg0,1952
 cloe_nessy/pipeline/actions/transform_replace_values.py,sha256=1OPHTrjcphfyGepcO7ozYfeqfwA18pjlyHpVKUS_AAU,2049
 cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=-GhSEsb7iNnZIsYRm3BG9BX4_qUDJMbpj1DsKPY046w,4574
@@ -100,7 +101,7 @@ cloe_nessy/pipeline/actions/transform_union.py,sha256=SZtEzh567CIExUj9yMEgshE28h
 cloe_nessy/pipeline/actions/transform_with_column.py,sha256=c-E1yYkeYmovbN1maT7ImpdQlW0nYvYsHCtDvfe4wt8,3357
 cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=FyC0scQU8Ul3Uigpk6IN2IJpf_4jRjAqF5yHtDVwG00,4852
 cloe_nessy/pipeline/actions/write_delta_append.py,sha256=e1g4mDhwAZdKyt4Gb7ZzHcQrJ1duSl8qOn6ONizRsoM,2934
-cloe_nessy/pipeline/actions/write_delta_merge.py,sha256=
+cloe_nessy/pipeline/actions/write_delta_merge.py,sha256=kZL2PTIwB6Mj4UKg5f9SvU1VaakuYfFoymlcLf-L7dA,6443
 cloe_nessy/pipeline/actions/write_file.py,sha256=JZ8UZslxUn_ttYt5wDyvtHFq2FqYk3vOR8kvExJI8pk,3212
 cloe_nessy/pipeline/utils/__init__.py,sha256=xi02UjBMiXWD7b9gDvww4gyRyowb0eRd_6Wbu0F_cro,118
 cloe_nessy/pipeline/utils/delta_load_utils.py,sha256=KitMNruxePEkecI0h4Jint1JwJpaEog5mCOchMkgan8,1495
@@ -112,6 +113,6 @@ cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_Up
 cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/utils/column_names.py,sha256=dCNtm61mc5aLkY2oE4rlfN3VLCrpot6fOESjAZmCmhA,361
 cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
-cloe_nessy-1.0.
-cloe_nessy-1.0.
-cloe_nessy-1.0.
+cloe_nessy-1.0.3.dist-info/METADATA,sha256=fqBGuiBnOft_b6Q3yS_hxFPi5pqduBX7V7bBeXYwkvQ,3291
+cloe_nessy-1.0.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+cloe_nessy-1.0.3.dist-info/RECORD,,

{cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.3.dist-info}/WHEEL: File without changes