cloe-nessy 0.3.16.6b0__py3-none-any.whl → 0.3.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/integration/delta_loader/__init__.py +14 -0
- cloe_nessy/integration/delta_loader/delta_load_options.py +37 -0
- cloe_nessy/integration/delta_loader/delta_loader.py +165 -0
- cloe_nessy/integration/delta_loader/delta_loader_factory.py +53 -0
- cloe_nessy/integration/delta_loader/delta_loader_metadata_table.py +68 -0
- cloe_nessy/integration/delta_loader/strategies/__init__.py +9 -0
- cloe_nessy/integration/delta_loader/strategies/delta_cdf_loader.py +361 -0
- cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py +163 -0
- cloe_nessy/integration/reader/catalog_reader.py +33 -6
- cloe_nessy/integration/reader/file_reader.py +23 -0
- cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +1 -1
- cloe_nessy/logging/logger_mixin.py +0 -1
- cloe_nessy/models/column.py +1 -1
- cloe_nessy/models/table.py +4 -3
- cloe_nessy/object_manager/table_manager.py +3 -1
- cloe_nessy/pipeline/actions/__init__.py +4 -0
- cloe_nessy/pipeline/actions/read_catalog_table.py +36 -3
- cloe_nessy/pipeline/actions/read_files.py +45 -3
- cloe_nessy/pipeline/actions/transform_convert_timestamp.py +97 -0
- cloe_nessy/pipeline/actions/transform_deduplication.py +7 -12
- cloe_nessy/pipeline/actions/transform_hash_columns.py +7 -7
- cloe_nessy/pipeline/actions/write_catalog_table.py +5 -0
- cloe_nessy/pipeline/actions/write_delta_append.py +15 -0
- cloe_nessy/pipeline/actions/write_delta_merge.py +23 -0
- cloe_nessy/pipeline/actions/write_file.py +6 -1
- cloe_nessy/pipeline/utils/__init__.py +5 -0
- cloe_nessy/pipeline/utils/delta_load_utils.py +36 -0
- cloe_nessy/utils/column_names.py +9 -0
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/METADATA +3 -3
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/RECORD +32 -20
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/WHEEL +0 -0
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/top_level.txt +0 -0
cloe_nessy/models/table.py
CHANGED

@@ -2,7 +2,6 @@ from pathlib import Path
 from typing import Any, Self
 
 import yaml
-import yaml.scanner
 from jinja2 import TemplateNotFound
 from pydantic import (
     Field,
@@ -11,6 +10,8 @@ from pydantic import (
     field_validator,
     model_validator,
 )
+from yaml.parser import ParserError
+from yaml.scanner import ScannerError
 
 from ..logging import LoggerMixin
 from ..utils.file_and_directory_handler import process_path
@@ -225,8 +226,8 @@ class Table(TemplateLoaderMixin, ReadInstancesMixin, LoggerMixin):
                 errors += sub_errors
             except (
                 ValidationError,
-
-
+                ParserError,
+                ScannerError,
             ) as e:
                 instance = None
                 errors.append(e)
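
The change above drops `import yaml.scanner` in favour of importing `ParserError` and `ScannerError` directly; both are the exceptions PyYAML raises for malformed documents. A minimal, hypothetical sketch of the same catch-and-collect idea outside the package (the function name and return shape are illustrative, not cloe-nessy API):

```python
from pathlib import Path

import yaml
from yaml.parser import ParserError
from yaml.scanner import ScannerError


def try_load_yaml(path: Path) -> tuple[dict | None, Exception | None]:
    """Parse a YAML file, returning either the mapping or the parse/scan error."""
    try:
        return yaml.safe_load(path.read_text()), None
    except (ParserError, ScannerError) as error:
        # Malformed YAML: collect the error instead of raising, mirroring errors.append(e) above.
        return None, error
```
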
cloe_nessy/object_manager/table_manager.py
CHANGED

@@ -198,7 +198,9 @@ class TableManager(LoggerMixin):
             ValueError: If neither table nor location is provided, or if both are provided.
         """
         if (table is None and location is None) or (table is not None and location is not None):
-            raise ValueError(
+            raise ValueError(
+                f"Either table or location must be provided, but not both. Table: {table}, location: {location}",
+            )
 
         if table is not None:
             location = str(table.storage_path)
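
The guard above is a plain either-or check on the two optional arguments; the new multi-line `raise` only adds the explanatory message. A standalone sketch of the same rule with hypothetical names:

```python
# Illustrative only: mirrors the either/or validation above, not the package's own signature.
def resolve_location(table=None, location=None) -> str:
    if (table is None and location is None) or (table is not None and location is not None):
        raise ValueError(
            f"Either table or location must be provided, but not both. Table: {table}, location: {location}",
        )
    return str(table.storage_path) if table is not None else location
```
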
cloe_nessy/pipeline/actions/__init__.py
CHANGED

@@ -9,7 +9,9 @@ from .read_metadata_yaml import ReadMetadataYAMLAction
 from .transform_change_datatype import TransformChangeDatatypeAction
 from .transform_clean_column_names import TransformCleanColumnNamesAction
 from .transform_concat_columns import TransformConcatColumnsAction
+from .transform_convert_timestamp import TransformConvertTimestampAction
 from .transform_decode import TransformDecodeAction
+from .transform_deduplication import TransformDeduplication
 from .transform_distinct import TransformDistinctAction
 from .transform_filter import TransformFilterAction
 from .transform_generic_sql import TransformSqlAction
@@ -45,7 +47,9 @@ __all__ = [
     "TransformChangeDatatypeAction",
     "TransformCleanColumnNamesAction",
     "TransformConcatColumnsAction",
+    "TransformConvertTimestampAction",
     "TransformDecodeAction",
+    "TransformDeduplication",
     "TransformDistinctAction",
     "TransformSqlAction",
     "TransformGroupAggregate",

cloe_nessy/pipeline/actions/read_catalog_table.py
CHANGED

@@ -1,8 +1,10 @@
 from typing import Any
 
+from ...integration.delta_loader import DeltaLoadOptions
 from ...integration.reader import CatalogReader
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
+from ..utils import set_delta_load_info
 
 
 class ReadCatalogTableAction(PipelineAction):
@@ -21,6 +23,12 @@ class ReadCatalogTableAction(PipelineAction):
         options:
             table_identifier: my_catalog.business_schema.sales_table
             options: <options for the CatalogReader read method>
+            delta_load_options:
+                strategy: CDF
+                delta_load_identifier: my_delta_load_id
+                strategy_options:
+                    deduplication_columns: ["id"]
+                enable_full_load: true
     ```
     """
 
@@ -32,6 +40,7 @@ class ReadCatalogTableAction(PipelineAction):
         *,
         table_identifier: str | None = None,
         options: dict[str, str] | None = None,
+        delta_load_options: dict[Any, Any] | DeltaLoadOptions | None = None,
         **_: Any,  # define kwargs to match the base class signature
     ) -> PipelineContext:
         """Reads a table from Unity Catalog using a specified table identifier and optional reader configurations.
@@ -45,6 +54,8 @@ class ReadCatalogTableAction(PipelineAction):
             options: A dictionary of options for customizing
                 the [`CatalogReader`][cloe_nessy.integration.reader.catalog_reader]
                 behavior, such as filters or reading modes. Defaults to None.
+            delta_load_options: Options for delta loading, if applicable.
+                Configures the [`DeltaLoader`][cloe_nessy.integration.delta_loader].
 
         Raises:
             ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
@@ -53,13 +64,35 @@
             An updated pipeline context containing the data read from the catalog table as a DataFrame.
         """
         if not options:
-            options =
+            options = {}
+
+        if not delta_load_options:
+            delta_load_options = {}
 
         if (table_metadata := context.table_metadata) and table_identifier is None:
             table_identifier = table_metadata.identifier
         if table_identifier is None:
             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
 
+        if isinstance(delta_load_options, dict):
+            delta_options_dict = delta_load_options
+            if delta_load_options:
+                delta_load_options = DeltaLoadOptions(**delta_load_options)
+            else:
+                delta_load_options = None
+        else:
+            delta_options_dict = delta_load_options.model_dump() if delta_load_options else {}
+
+        runtime_info = set_delta_load_info(
+            table_identifier=table_identifier,
+            delta_load_options=delta_options_dict,
+            runtime_info=context.runtime_info or {},
+        )
+
         table_reader = CatalogReader()
-        df = table_reader.read(
-
+        df = table_reader.read(
+            table_identifier=table_identifier,
+            options=options,
+            delta_load_options=delta_load_options,
+        )
+        return context.from_existing(data=df, runtime_info=runtime_info)
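
The new branching accepts `delta_load_options` either as a plain mapping (typical for YAML-driven pipelines) or as a `DeltaLoadOptions` model, and keeps a dict copy for the runtime info. A standalone sketch of that normalization, using a stand-in pydantic model because the real `DeltaLoadOptions` schema is not part of this diff (field names follow the YAML example above):

```python
from typing import Any

from pydantic import BaseModel


class DummyOptions(BaseModel):
    """Stand-in for DeltaLoadOptions; the real model lives in cloe_nessy.integration.delta_loader."""

    strategy: str
    delta_load_identifier: str
    strategy_options: dict[str, Any] = {}
    enable_full_load: bool = False


def normalize(options: dict[str, Any] | DummyOptions | None) -> tuple[DummyOptions | None, dict[str, Any]]:
    """Return (model_or_None, dict_copy), mirroring the action's branching above."""
    if isinstance(options, dict) or options is None:
        options_dict = options or {}
        model = DummyOptions(**options_dict) if options_dict else None
    else:
        options_dict = options.model_dump()
        model = options
    return model, options_dict


model, as_dict = normalize(
    {"strategy": "CDF", "delta_load_identifier": "my_delta_load_id", "enable_full_load": True}
)
```
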
cloe_nessy/pipeline/actions/read_files.py
CHANGED

@@ -1,8 +1,10 @@
 from typing import Any
 
+from ...integration.delta_loader import DeltaLoadOptions
 from ...integration.reader import FileReader
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
+from ..utils import set_delta_load_info
 
 
 class ReadFilesAction(PipelineAction):
@@ -55,6 +57,24 @@ class ReadFilesAction(PipelineAction):
         Use the `extension` option to specify the extension of the files
         to read. Additionally, use the `spark_format` option to specify
         the format with which to read the files.
+
+    === "Read Delta Lake table with delta loading"
+        ```yaml
+        Read Delta Files:
+            action: READ_FILES
+            options:
+                location: /path/to/delta/table
+                spark_format: delta
+                delta_load_options:
+                    strategy: CDF
+                    delta_load_identifier: my_delta_files_load
+                    strategy_options:
+                        deduplication_columns: ["id"]
+                    enable_full_load: false
+        ```
+        !!! note "Delta Loading for Files"
+            Use `delta_load_options` when reading Delta Lake tables to enable
+            incremental loading. This works with both CDF and timestamp strategies.
     """
 
     name: str = "READ_FILES"
@@ -70,6 +90,7 @@ class ReadFilesAction(PipelineAction):
         schema: str | None = None,
         add_metadata_column: bool = True,
         options: dict[str, str] | None = None,
+        delta_load_options: dict[Any, Any] | DeltaLoadOptions | None = None,
         **_: Any,
     ) -> PipelineContext:
         """Reads files from a specified location.
@@ -87,6 +108,8 @@
             add_metadata_column: Whether to include the `__metadata` column with
                 file metadata in the DataFrame.
             options: Additional options passed to the reader.
+            delta_load_options: Options for delta loading, if applicable. When provided
+                for Delta format files, enables incremental loading using delta loader strategies.
 
         Raises:
             ValueError: If neither `extension` nor `spark_format` are provided, or if
@@ -105,6 +128,25 @@
         if (metadata := context.table_metadata) and schema is None:
             schema = metadata.schema
 
+        # Convert dict to DeltaLoadOptions if needed
+        if isinstance(delta_load_options, dict):
+            delta_load_options = DeltaLoadOptions(**delta_load_options)
+
+        # Set up runtime info for delta loading
+        runtime_info = context.runtime_info or {}
+        if delta_load_options:
+            # Convert DeltaLoadOptions to dict for runtime info storage
+            delta_options_dict = (
+                delta_load_options.model_dump()
+                if isinstance(delta_load_options, DeltaLoadOptions)
+                else delta_load_options
+            )
+            runtime_info = set_delta_load_info(
+                table_identifier=location,  # Use location as identifier for file-based delta loading
+                delta_load_options=delta_options_dict,
+                runtime_info=runtime_info,
+            )
+
         file_reader = FileReader()
         df = file_reader.read(
             location=location,
@@ -114,11 +156,11 @@
             search_subdirs=search_subdirs,
             options=options,
             add_metadata_column=add_metadata_column,
+            delta_load_options=delta_load_options,
         )
 
-
-
-        if add_metadata_column:
+        # Only process metadata column if it exists and wasn't using delta loading
+        if add_metadata_column and "__metadata" in df.columns:
             read_files_list = [x.file_path for x in df.select("__metadata.file_path").drop_duplicates().collect()]
             if runtime_info is None:
                 runtime_info = {"read_files": read_files_list}
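
One behavioural detail in the hunk above: the `read_files` bookkeeping now runs only when the nested `__metadata` struct is actually present, since a delta load can return a DataFrame without per-file metadata. A small standalone sketch of that guard (the helper name is illustrative):

```python
from pyspark.sql import DataFrame


def collect_read_files(df: DataFrame, add_metadata_column: bool) -> list[str]:
    """Return distinct source file paths, but only if the __metadata column exists."""
    if add_metadata_column and "__metadata" in df.columns:
        return [row.file_path for row in df.select("__metadata.file_path").drop_duplicates().collect()]
    return []
```
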
cloe_nessy/pipeline/actions/transform_convert_timestamp.py
ADDED

@@ -0,0 +1,97 @@
+from typing import Any
+
+from pyspark.errors.exceptions.connect import IllegalArgumentException
+from pyspark.sql import functions as F
+from pyspark.sql.utils import AnalysisException
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformConvertTimestampAction(PipelineAction):
+    """This action performs timestamp based conversions.
+
+    Example:
+        ```yaml
+        Convert Timestamp:
+            action: TRANSFORM_CONVERT_TIMESTAMP
+            options:
+                columns:
+                    - date
+                    - creation_timestamp
+                    - current_ts
+                source_format: unixtime_ms
+                target_format: timestamp
+        ```
+    """
+
+    name: str = "TRANSFORM_CONVERT_TIMESTAMP"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        columns: list[str] | str | None = None,
+        source_format: str = "",
+        target_format: str = "",
+        **_: Any,
+    ) -> PipelineContext:
+        """Converts column(s) from a given source format to a new format.
+
+        Args:
+            context: Context in which this Action is executed.
+            columns: A column name or a list of column names that should be converted.
+            source_format: Initial format type of the column.
+            target_format: Desired format type of the column.
+                This also supports passing a format string like `yyyy-MM-dd HH:mm:ss`.
+
+        Raises:
+            ValueError: If no column, source_format or target_format are provided.
+            ValueError: If source_format or target_format are not supported.
+
+        Returns:
+            PipelineContext: Context after the execution of this Action.
+        """
+        if not columns:
+            raise ValueError("No column names provided.")
+        if not source_format:
+            raise ValueError("No source_format provided.")
+        if not target_format:
+            raise ValueError("No target_format provided.")
+        if context.data is None:
+            raise ValueError("Context DataFrame is required.")
+        df = context.data
+
+        columns = [columns] if isinstance(columns, str) else columns
+
+        match source_format:
+            # convert always to timestamp first
+            case "string" | "date" | "unixtime":
+                for column in columns:
+                    df = df.withColumn(column, F.to_timestamp(F.col(column)))
+            case "unixtime_ms":
+                for column in columns:
+                    df = df.withColumn(column, F.to_timestamp(F.col(column) / 1000))
+            case "timestamp":
+                pass
+            case _:
+                raise ValueError(f"Unknown source_format {source_format}")
+
+        match target_format:
+            # convert from timestamp to desired output type and format
+            case "timestamp":
+                pass
+            case "unixtime":
+                for column in columns:
+                    df = df.withColumn(column, F.to_unix_timestamp(F.col(column)))
+            case "date":
+                for column in columns:
+                    df = df.withColumn(column, F.to_date(F.col(column)))
+            case _:
+                try:
+                    for column in columns:
+                        df = df.withColumn(column, F.date_format(F.col(column), target_format))
+                except (IllegalArgumentException, AnalysisException) as e:
+                    raise ValueError(f"Invalid target_format {target_format}") from e
+
+        return context.from_existing(data=df)
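
To see what the `source_format`/`target_format` branches do in practice, here is a minimal local PySpark run of the `unixtime_ms` to `timestamp` path, a sketch with made-up data that assumes a local Spark session:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").appName("convert-ts-demo").getOrCreate()
df = spark.createDataFrame([(1_700_000_000_000,)], ["creation_timestamp"])

# unixtime_ms holds epoch milliseconds, so divide by 1000 before converting to a timestamp
df = df.withColumn("creation_timestamp", F.to_timestamp(F.col("creation_timestamp") / 1000))
df.show(truncate=False)  # 2023-11-14 22:13:20 when the session timezone is UTC
```
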
cloe_nessy/pipeline/actions/transform_deduplication.py
CHANGED

@@ -1,11 +1,10 @@
-import random
-import string
 from typing import Any
 
 import pyspark.sql.functions as F
 import pyspark.sql.types as T
 from pyspark.sql import Window
 
+from ...utils.column_names import generate_unique_column_name
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
@@ -105,18 +104,14 @@ class TransformDeduplication(PipelineAction):
         else:
             order_by_list = [F.col(col_name).asc() for col_name in order_by_columns]
 
-
-
-
-
-
-            return "".join(random.choice(string.ascii_uppercase) for _ in range(length))
+        window_specification = (
+            Window.partitionBy(key_columns)
+            .orderBy(order_by_list)
+            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
+        )
 
-        row_number_col_name =
-        while row_number_col_name in context.data.columns:
-            row_number_col_name = generate_random_string(20)
+        row_number_col_name = generate_unique_column_name(existing_columns=set(context.data.columns), prefix="row_num")
 
-        # drop the duplicates
         df = (
             context.data.withColumn(row_number_col_name, F.row_number().over(window_specification))
             .filter(F.col(row_number_col_name) == 1)
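
The refactor keeps the same deduplication mechanic: number the rows per key inside a window and keep the first one, using a helper column that cannot collide with existing names. A self-contained sketch of that pattern on toy data (column names invented; the uuid suffix mirrors `generate_unique_column_name`):

```python
import uuid

import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.master("local[1]").appName("dedup-demo").getOrCreate()
df = spark.createDataFrame(
    [(1, "2024-01-01"), (1, "2024-02-01"), (2, "2024-01-15")], ["id", "loaded_at"]
)

window_spec = (
    Window.partitionBy("id")
    .orderBy(F.col("loaded_at").desc())
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
row_num_col = f"row_num_{uuid.uuid4().hex[:8]}"  # collision-free helper column
deduped = (
    df.withColumn(row_num_col, F.row_number().over(window_spec))
    .filter(F.col(row_num_col) == 1)
    .drop(row_num_col)
)
deduped.show()  # one row per id, the latest loaded_at wins
```
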
cloe_nessy/pipeline/actions/transform_hash_columns.py
CHANGED

@@ -132,13 +132,13 @@ class TransformHashColumnsAction(PipelineAction):
         action: TRANSFORM_HASH_COLUMNS
         options:
             hash_config:
-
-
-
-
-
-
-
+                hashed_column1:
+                    columns: ["column1", "column2"]
+                    algorithm: "sha2"
+                    bits: 224
+                hashed_column2:
+                    columns: ["column1"]
+                    algorithm: "crc32"
     ```
 
     Given a DataFrame `df` with the following structure:
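
For orientation, the repaired `hash_config` example maps each output column to its source columns plus an algorithm. Below is a rough PySpark equivalent of those two entries; how the action actually combines multi-column input is not shown in this diff, so the `concat_ws("|", ...)` step is purely an assumption:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").appName("hash-demo").getOrCreate()
df = spark.createDataFrame([("a", "b")], ["column1", "column2"])

df = (
    df.withColumn("hashed_column1", F.sha2(F.concat_ws("|", "column1", "column2"), 224))  # assumed concatenation
    .withColumn("hashed_column2", F.crc32(F.col("column1").cast("binary")))
)
df.show(truncate=False)
```
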
cloe_nessy/pipeline/actions/write_catalog_table.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import Any
 
+from ...integration.delta_loader import consume_delta_load
 from ...integration.writer import CatalogWriter
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
@@ -63,6 +64,10 @@ class WriteCatalogTableAction(PipelineAction):
         if table_identifier is None:
             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
 
+        runtime_info = getattr(context, "runtime_info", None)
+        if runtime_info and runtime_info.get("is_delta_load"):
+            consume_delta_load(runtime_info)
+
         writer = CatalogWriter()
         writer.write_table(
             df=context.data,  # type: ignore
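
The same three-line hand-off appears in every writer action touched by this release (catalog table, delta append, delta merge, file write): after the write, any pending delta load recorded in `runtime_info` is consumed. A hedged sketch of that guard extracted into a helper; what `consume_delta_load` records internally lives in the new `delta_loader` package and is not visible here:

```python
from cloe_nessy.integration.delta_loader import consume_delta_load


def finalize_delta_load(runtime_info: dict | None) -> None:
    """After a successful write, acknowledge a pending delta load, if any (illustrative helper)."""
    if runtime_info and runtime_info.get("is_delta_load"):
        consume_delta_load(runtime_info)
```
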
cloe_nessy/pipeline/actions/write_delta_append.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import Any
 
+from ...integration.delta_loader import consume_delta_load
 from ...integration.writer import DeltaAppendWriter
 from ...models.adapter import UnityCatalogAdapter
 from ...pipeline import PipelineAction, PipelineContext
@@ -10,6 +11,15 @@ class WriteDeltaAppendAction(PipelineAction):
 
     The WriteDeltaAppendAction appends a Dataframe to Delta Table.
 
+    Example:
+        ```yaml
+        Write Delta Append:
+            action: WRITE_DELTA_APPEND
+            options:
+                table_identifier: my_catalog.my_schema.my_table
+                ignore_empty_df: false
+        ```
+
     Returns:
         None.
     """
@@ -66,4 +76,9 @@ class WriteDeltaAppendAction(PipelineAction):
             ignore_empty_df=ignore_empty_df,
             options=options,
         )
+
+        runtime_info = getattr(context, "runtime_info", None)
+        if runtime_info and runtime_info.get("is_delta_load"):
+            consume_delta_load(runtime_info)
+
         return context.from_existing()

cloe_nessy/pipeline/actions/write_delta_merge.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import Any
 
+from ...integration.delta_loader import consume_delta_load
 from ...integration.writer import DeltaMergeWriter
 from ...models.adapter import UnityCatalogAdapter
 from ...pipeline import PipelineAction, PipelineContext
@@ -10,6 +11,24 @@ class WriteDeltaMergeAction(PipelineAction):
 
     The MergeIntoDeltaAction merges a Dataframe to Delta Table.
 
+    Example:
+        ```yaml
+        Write Delta Merge:
+            action: WRITE_DELTA_MERGE
+            options:
+                table_identifier: my_catalog.my_schema.my_table
+                key_columns:
+                    - id
+                    - customer_id
+                cols_to_update:
+                    - name
+                    - email
+                    - updated_at
+                when_matched_update: true
+                when_not_matched_insert: true
+                use_partition_pruning: true
+        ```
+
     Returns:
         None.
     """
@@ -112,6 +131,10 @@ class WriteDeltaMergeAction(PipelineAction):
             ignore_empty_df=ignore_empty_df,
         )
 
+        runtime_info = getattr(context, "runtime_info", None)
+        if runtime_info and runtime_info.get("is_delta_load"):
+            consume_delta_load(runtime_info)
+
         if refresh_table:
             delta_merge_writer.table_manager.refresh_table(table_identifier=context.table_metadata.identifier)
 

cloe_nessy/pipeline/actions/write_file.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import Any
 
+from ...integration.delta_loader import consume_delta_load
 from ...integration.writer import FileWriter
 from ...pipeline import PipelineAction, PipelineContext
 
@@ -21,7 +22,7 @@ class WriteFileAction(PipelineAction):
             mode: "append"
             is_stream: False
             options:
-                mergeSchema:
+                mergeSchema: true
         ```
     """
 
@@ -91,4 +92,8 @@ class WriteFileAction(PipelineAction):
             options=options,
         )
 
+        runtime_info = getattr(context, "runtime_info", None)
+        if runtime_info and runtime_info.get("is_delta_load"):
+            consume_delta_load(runtime_info)
+
         return context.from_existing()

cloe_nessy/pipeline/utils/delta_load_utils.py
ADDED

@@ -0,0 +1,36 @@
+"""Utilities for managing delta load information in pipeline runtime context."""
+
+from typing import Any
+
+
+def set_delta_load_info(
+    table_identifier: str,
+    delta_load_options: dict[str, Any],
+    runtime_info: dict[str, Any],
+) -> dict[str, Any]:
+    """Update the runtime information dictionary with delta load options for a specific table.
+
+    If delta load options are provided, this function marks the runtime as a delta load and
+    stores the options under the given table identifier within the 'delta_load_options' key
+    of the runtime_info dictionary.
+
+    The method uses `setdefault("delta_load_options", {})` to ensure that the 'delta_load_options'
+    key exists in the runtime_info dictionary. If the key is not present, it initializes it with
+    an empty dictionary. This prevents overwriting existing delta load options and allows
+    multiple tables' options to be stored without losing previous entries.
+
+    Args:
+        table_identifier: The identifier for the table (can be table name or file path).
+        delta_load_options: Options specific to the delta load for the table.
+        runtime_info: The runtime information dictionary to update.
+
+    Returns:
+        The updated runtime information dictionary with delta load details.
+    """
+    if not delta_load_options:
+        return runtime_info
+
+    runtime_info["is_delta_load"] = True
+    runtime_info.setdefault("delta_load_options", {})[table_identifier] = delta_load_options
+
+    return runtime_info
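
A short usage sketch for `set_delta_load_info`, showing how calls for two different sources accumulate in one `runtime_info` dictionary (identifiers and strategy values are illustrative; the import path follows the package layout in this diff):

```python
from cloe_nessy.pipeline.utils import set_delta_load_info

runtime_info: dict = {}
runtime_info = set_delta_load_info(
    table_identifier="my_catalog.business_schema.sales_table",
    delta_load_options={"strategy": "CDF", "delta_load_identifier": "my_delta_load_id"},
    runtime_info=runtime_info,
)
runtime_info = set_delta_load_info(
    table_identifier="/path/to/delta/table",
    delta_load_options={"strategy": "TIMESTAMP"},  # illustrative strategy value
    runtime_info=runtime_info,
)

# runtime_info now holds is_delta_load=True and one entry per identifier:
# {"is_delta_load": True,
#  "delta_load_options": {
#      "my_catalog.business_schema.sales_table": {"strategy": "CDF", "delta_load_identifier": "my_delta_load_id"},
#      "/path/to/delta/table": {"strategy": "TIMESTAMP"}}}
```
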
cloe_nessy/utils/column_names.py
ADDED

@@ -0,0 +1,9 @@
+import uuid
+
+
+def generate_unique_column_name(existing_columns: set[str], prefix: str = "temp_col") -> str:
+    """Generate a unique column name that doesn't conflict with existing columns."""
+    base_name = f"{prefix}_{uuid.uuid4().hex[:8]}"
+    while base_name in existing_columns:
+        base_name = f"{prefix}_{uuid.uuid4().hex[:8]}"
+    return base_name
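
Usage is straightforward; the helper appends an eight-character uuid suffix and retries on the unlikely collision:

```python
from cloe_nessy.utils.column_names import generate_unique_column_name

existing = {"id", "name", "row_num_1a2b3c4d"}
col_name = generate_unique_column_name(existing_columns=existing, prefix="row_num")
assert col_name.startswith("row_num_") and col_name not in existing
```
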
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cloe-nessy
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.17.0
|
|
4
4
|
Summary: Your friendly datalake monster.
|
|
5
5
|
Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
|
|
6
6
|
License: MIT
|
|
@@ -58,12 +58,12 @@ Extract-Transform-Load (ETL) Workflow.
|
|
|
58
58
|
|
|
59
59
|
When you are contributing, please refer to our Contribution Guide in the *nessy*
|
|
60
60
|
Docs
|
|
61
|
-
[here](https://
|
|
61
|
+
[here](https://yellow-mud-0b9177e03.2.azurestaticapps.net/tool_docs/nessy/Developer-Guide/)!
|
|
62
62
|
|
|
63
63
|
## Usage
|
|
64
64
|
|
|
65
65
|
Please find the User Guide
|
|
66
|
-
[here](https://
|
|
66
|
+
[here](https://yellow-mud-0b9177e03.2.azurestaticapps.net/tool_docs/nessy/User-Guide/)!
|
|
67
67
|
|
|
68
68
|
## Contact
|
|
69
69
|
|