cloe-nessy 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/integration/reader/api_reader.py +4 -2
- cloe_nessy/integration/reader/catalog_reader.py +6 -3
- cloe_nessy/integration/reader/excel_reader.py +1 -1
- cloe_nessy/integration/reader/file_reader.py +78 -5
- cloe_nessy/integration/writer/__init__.py +8 -1
- cloe_nessy/integration/writer/delta_writer/__init__.py +7 -0
- cloe_nessy/integration/writer/delta_writer/delta_append_writer.py +108 -0
- cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +215 -0
- cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +21 -0
- cloe_nessy/integration/writer/delta_writer/delta_writer_base.py +210 -0
- cloe_nessy/integration/writer/delta_writer/exceptions.py +4 -0
- cloe_nessy/integration/writer/file_writer.py +132 -0
- cloe_nessy/integration/writer/writer.py +54 -0
- cloe_nessy/models/adapter/unity_catalog_adapter.py +5 -1
- cloe_nessy/models/schema.py +1 -1
- cloe_nessy/models/table.py +32 -10
- cloe_nessy/models/volume.py +13 -4
- cloe_nessy/object_manager/table_manager.py +73 -19
- cloe_nessy/pipeline/actions/__init__.py +7 -1
- cloe_nessy/pipeline/actions/read_catalog_table.py +1 -4
- cloe_nessy/pipeline/actions/write_delta_append.py +69 -0
- cloe_nessy/pipeline/actions/write_delta_merge.py +118 -0
- cloe_nessy/pipeline/actions/write_file.py +94 -0
- {cloe_nessy-0.3.8.dist-info → cloe_nessy-0.3.10.dist-info}/METADATA +28 -4
- {cloe_nessy-0.3.8.dist-info → cloe_nessy-0.3.10.dist-info}/RECORD +27 -16
- {cloe_nessy-0.3.8.dist-info → cloe_nessy-0.3.10.dist-info}/WHEEL +1 -1
- {cloe_nessy-0.3.8.dist-info → cloe_nessy-0.3.10.dist-info}/top_level.txt +0 -0
@@ -48,9 +48,8 @@ def table_log_decorator(operation: str):
     def inner_decorator(func):
         @functools.wraps(func)
         def wrapper(self, *args, **kwargs):
-            table_identifier = kwargs.get("table_identifier") or kwargs.get("table")
-            if
-                # assume its a Table object
+            table_identifier = kwargs.get("table_identifier") or kwargs.get("table") or args[0]
+            if isinstance(table_identifier, Table):
                 table_identifier = table_identifier.identifier
             self._tabular_logger.info(
                 "operation:%s | identifier:%s | status:start | error:''",
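
The change above broadens how the logging decorator resolves the table identifier: it now also falls back to the first positional argument before checking whether the value is a `Table` object. A minimal, standalone sketch of that resolution order (simplified; the real wrapper additionally emits start/end records through the tabular logger):

```python
# Simplified sketch of the fallback chain now used in table_log_decorator's wrapper.
def resolve_identifier(args: tuple, kwargs: dict):
    # keyword "table_identifier" wins, then keyword "table", then the first positional argument
    return kwargs.get("table_identifier") or kwargs.get("table") or args[0]

print(resolve_identifier((), {"table_identifier": "catalog.schema.orders"}))  # keyword form
print(resolve_identifier(("catalog.schema.orders",), {}))                     # new positional fallback
```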
@@ -84,7 +83,6 @@ class TableManager(LoggerMixin):
 
     def __init__(self, tabular_logger: logging.Logger | None = None):
         self._spark = SessionManager.get_spark_session()
-        self._utils = SessionManager.get_utils()
         self._console_logger = self.get_console_logger()
         self._console_logger.debug("TableManager initialized...")
         self._tabular_logger = tabular_logger or self.get_tabular_logger(**TableManagerLogs().__dict__)
@@ -115,51 +113,83 @@ class TableManager(LoggerMixin):
         if statement and statement != "\n":
             self._spark.sql(statement)
 
-    def drop_table(
+    def drop_table(
+        self,
+        table: Table | None = None,
+        storage_location: str | None = None,
+        table_identifier: str | None = None,
+        delete_physical_data: bool = False,
+    ):
         """Deletes a Table. For security reasons you are forced to pass the table_name.
 
         If delete_physical_data is True the actual physical data on the ADLS will be deleted.
         Use with caution!
 
         Args:
+            table: The Table object representing the Delta table.
+            storage_location: The location of the Delta table on the ADLS.
             table_identifier: The table identifier in the catalog. Must be in the format 'catalog.schema.table'.
             delete_physical_data: If set to True, deletes not only the metadata
                 within the Catalog but also the physical data.
+
+        Raises:
+            ValueError: If neither table nor table_identifier is provided, or if both are provided.
+            ValueError: If the table storage path is not provided by the table object.
         """
         self._console_logger.info(f"Deleting table [ '{table_identifier}' ] ...")
-        if not
-            raise
-
+        if table is not None and (table_identifier is not None or storage_location is not None):
+            raise ValueError("Either table or table_identifier and storage_location must be provided, but not both.")
+        if table is not None:
+            table_identifier = table.identifier
+            storage_location = str(table.storage_path)
         if delete_physical_data:
-            self._delete_physical_data()
-        self.drop_table_from_catalog(table_identifier)
+            self._delete_physical_data(location=storage_location)
+        self.drop_table_from_catalog(table_identifier=table_identifier)
 
-    def drop_table_from_catalog(self, table_identifier: str) -> None:
+    def drop_table_from_catalog(self, table_identifier: str | None = None, table: Table | None = None) -> None:
         """Removes a table from the catalog. Physical data is retained.
 
         Args:
             table_identifier: The table identifier in the catalog. Must be in the format 'catalog.schema.table'.
+            table: The Table object representing the Delta table.
+
+        Raises:
+            ValueError: If neither table nor table_identifier is provided, or if both are provided.
         """
+        if (table is None and table_identifier is None) or (table is not None and table_identifier is not None):
+            raise ValueError("Either table or table_identifier must be provided, but not both.")
+        if table is not None:
+            table_identifier = table.identifier
         self._console_logger.info(f"... deleting table [ '{table_identifier}' ] from Catalog.")
-        if not isinstance(table_identifier, str):
-            raise NotImplementedError("table_identifier must be a string, can be a Table object in the future.")
         self._spark.sql(f"DROP TABLE IF EXISTS {table_identifier};")
 
-    def _delete_physical_data(self):
+    def _delete_physical_data(self, table: Table | None = None, location: str | None = None):
         """Removes the physical data on the ADLS for the location of this table.
 
+        Args:
+            table: The Table object representing the Delta table to be deleted.
+            location: The location of the Delta table to be deleted.
+
         Raises:
-
+            ValueError: If neither table nor location is provided, or if both are provided.
+            ValueError: If the table storage path is not provided by the table object.
         """
-
-
+        if (table is None and location is None) or (table is not None and location is not None):
+            raise ValueError("Either table or location must be provided, but not both.")
+        if table is not None:
+            if table.storage_path is None:
+                raise ValueError("Table storage path must be provided.")
+            location = str(table.storage_path)
+        SessionManager.get_utils().fs.rm(location, recurse=True)
+        self._console_logger.info("... deleting physical data.")
 
-    def get_delta_table(self, table: Table | None = None, location: str | None = None) -> DeltaTable:
+    def get_delta_table(self, table: Table | None = None, location: str | None = None, spark=None) -> DeltaTable:
         """Get the DeltaTable object from the Table objects location or a location string.
 
         Args:
             table: A Table object representing the Delta table.
             location: A string representing the table location.
+            spark: An optional Spark session. If not provided, the current Spark session will be used.
 
         Returns:
             The DeltaTable object corresponding to the given Table object or location string.
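
The reworked `drop_table` in `table_manager.py` now accepts either a `Table` object or an explicit `table_identifier`/`storage_location` pair, and `drop_table_from_catalog`/`_delete_physical_data` follow the same either-or pattern. A hypothetical usage sketch (import path assumed from the RECORD listing below; identifier and storage location are illustrative, and a live Spark/Databricks session is required):

```python
# Illustrative sketch only; TableManager pulls its Spark session from SessionManager.
from cloe_nessy.object_manager import TableManager  # assumed import path

manager = TableManager()

# Variant 1: drive everything from a Table object (identifier and storage path are read from it).
# manager.drop_table(table=orders_table, delete_physical_data=True)

# Variant 2: pass identifier and storage location explicitly; here only catalog metadata is dropped.
manager.drop_table(
    table_identifier="catalog.schema.orders",
    storage_location="abfss://data@myaccount.dfs.core.windows.net/orders",  # hypothetical ADLS path
    delete_physical_data=False,
)

# Passing a Table object together with an explicit identifier/location raises ValueError.
```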
@@ -173,7 +203,7 @@ class TableManager(LoggerMixin):
         if table is not None:
             location = str(table.storage_path)
         self._console_logger.info(f"Getting DeltaTable object for location: {location}")
-        return DeltaTable.forPath(self._spark, str(location))
+        return DeltaTable.forPath(spark or self._spark, str(location))
 
     def table_exists(self, table: Table | None = None, table_identifier: str | None = None) -> bool:
         """Checks if a table exists in the catalog.
@@ -232,3 +262,27 @@
 
         self._console_logger.info(f"Refreshing table: {table_identifier}")
         self._spark.sql(f"REFRESH TABLE {table_identifier};")
+
+    @table_log_decorator(operation="truncate")
+    def truncate_table(
+        self,
+        table: Table | None = None,
+        table_identifier: str | None = None,
+    ):
+        """Truncates a table.
+
+        Args:
+            table: A Table object representing the Delta table.
+            table_identifier: The identifier of the Delta table in the format 'catalog.schema.table'.
+
+        Raises:
+            ValueError: If neither table nor table_identifier is provided, or if both are provided.
+        """
+        if (table is None and table_identifier is None) or (table is not None and table_identifier is not None):
+            raise ValueError("Either table or table_identifier must be provided, but not both.")
+
+        if table is not None:
+            table_identifier = table.escaped_identifier
+
+        self._console_logger.info(f"Truncating table: {table_identifier}")
+        self._spark.sql(f"TRUNCATE TABLE {table_identifier};")
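
A hypothetical call pattern for the new `truncate_table` (reusing the `manager` from the sketch above; exactly one of `table` or `table_identifier` may be given):

```python
# Illustrative sketch; requires a live Spark session.
manager.truncate_table(table_identifier="catalog.schema.orders")    # by identifier
# manager.truncate_table(table=orders_table)                        # or by Table object
# manager.truncate_table()                                          # ValueError: neither given
# manager.truncate_table(table=orders_table,
#                        table_identifier="catalog.schema.orders")  # ValueError: both given
```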
@@ -22,6 +22,9 @@ from .transform_replace_values import TransformReplaceValuesAction
 from .transform_select_columns import TransformSelectColumnsAction
 from .transform_union import TransformUnionAction
 from .write_catalog_table import WriteCatalogTableAction
+from .write_delta_append import WriteDeltaAppendAction
+from .write_delta_merge import WriteDeltaMergeAction
+from .write_file import WriteFileAction
 
 # Get all subclasses of PipelineAction defined in this submodule
 pipeline_actions = {cls.name: cls for cls in PipelineAction.__subclasses__()}
@@ -36,7 +39,6 @@ __all__ = [
     "ReadExcelAction",
     "ReadFilesAction",
     "ReadMetadataYAMLAction",
-    "WriteCatalogTableAction",
    "PipelineActionType",
     "TransformFilterAction",
     "TransformUnionAction",
@@ -52,5 +54,9 @@ __all__ = [
     "TransformRenameColumnsAction",
     "TransformReplaceValuesAction",
     "TransformSelectColumnsAction",
+    "WriteCatalogTableAction",
+    "WriteDeltaAppendAction",
+    "WriteDeltaMergeAction",
+    "WriteFileAction",
     "TransformHashColumnsAction",
 ]
@@ -61,8 +61,5 @@ class ReadCatalogTableAction(PipelineAction):
             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
 
         table_reader = CatalogReader()
-        df = table_reader.read(
-            table_identifier=table_identifier,
-            **options,
-        )
+        df = table_reader.read(table_identifier=table_identifier, options=options)
         return context.from_existing(data=df)
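
In `read_catalog_table.py`, reader options are no longer unpacked into keyword arguments but handed over as a single `options` dict, matching the updated `CatalogReader.read` call shape. A small sketch of the difference (import path assumed; the option value is illustrative):

```python
# Illustrative sketch; "versionAsOf" is just an example Spark read option.
from cloe_nessy.integration.reader import CatalogReader  # assumed import path

table_reader = CatalogReader()
options = {"versionAsOf": "3"}

# 0.3.8 style: options were spread into keyword arguments
# df = table_reader.read(table_identifier="catalog.schema.orders", **options)

# 0.3.10 style: the dict is forwarded as one parameter
df = table_reader.read(table_identifier="catalog.schema.orders", options=options)
```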
@@ -0,0 +1,69 @@
+from typing import Any
+
+from ...integration.writer import DeltaAppendWriter
+from ...models.adapter import UnityCatalogAdapter
+from ...pipeline import PipelineAction, PipelineContext
+
+
+class WriteDeltaAppendAction(PipelineAction):
+    """This class implements an Append action for an ETL pipeline.
+
+    The WriteDeltaAppendAction appends a Dataframe to Delta Table.
+
+    Returns:
+        None.
+    """
+
+    name: str = "WRITE_DELTA_APPEND"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        table_identifier: str | None = None,
+        ignore_empty_df: bool = False,
+        options: dict[str, Any] | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Merge the dataframe into the delta table.
+
+        Args:
+            context: Context in which this Action is executed.
+            table_identifier: The identifier of the table. If passed, the
+                UC Adapter will be used to create a table object. Otherwise the Table
+                object will be created from the table metadata in the context.
+            ignore_empty_df: A flag indicating whether to ignore an empty source dataframe.
+            options: Additional options for the append writer.
+
+        Raises:
+            ValueError: If the table does not exist.
+            ValueError: If the data is not set in the pipeline context.
+            ValueError: If the table metadata is empty.
+
+        Returns:
+            Pipeline Context
+        """
+        delta_append_writer = DeltaAppendWriter()
+
+        if context.data is None:
+            raise ValueError("Data is required for the append operation.")
+        if context.table_metadata is None and table_identifier is None:
+            raise ValueError("Table metadata or a table identifier are required for the append operation.")
+
+        if table_identifier is not None:
+            context.table_metadata = UnityCatalogAdapter().get_table_by_name(table_identifier)
+        else:
+            if context.table_metadata is None:
+                raise ValueError("Table metadata is required.")
+
+        if context.table_metadata is None:
+            raise ValueError("Table metadata is required.")
+
+        delta_append_writer.write(
+            table_identifier=context.table_metadata.identifier,
+            table_location=context.table_metadata.storage_path,
+            data_frame=context.data,
+            ignore_empty_df=ignore_empty_df,
+            options=options,
+        )
+        return context.from_existing()
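
A hypothetical direct invocation of the new append action (in a pipeline definition it is normally referenced by its `name`, `WRITE_DELTA_APPEND`); the identifier and options are illustrative:

```python
# Illustrative sketch; `context` is an existing PipelineContext whose `data` holds the source DataFrame.
from cloe_nessy.pipeline.actions import WriteDeltaAppendAction

action = WriteDeltaAppendAction()
new_context = action.run(
    context,
    table_identifier="catalog.schema.orders",  # resolved to a Table via the UnityCatalogAdapter
    ignore_empty_df=True,
    options={"mergeSchema": "true"},           # example writer option
)
```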
@@ -0,0 +1,118 @@
+from typing import Any
+
+from ...integration.writer import DeltaMergeWriter
+from ...models.adapter import UnityCatalogAdapter
+from ...pipeline import PipelineAction, PipelineContext
+
+
+class WriteDeltaMergeAction(PipelineAction):
+    """This class implements a Merge action for an ETL pipeline.
+
+    The MergeIntoDeltaAction merges a Dataframe to Delta Table.
+
+    Returns:
+        None.
+    """
+
+    name: str = "WRITE_DELTA_MERGE"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        table_identifier: str | None = None,
+        key_columns: list[str] | None = None,
+        cols_to_update: list[str] | None = None,
+        cols_to_insert: list[str] | None = None,
+        cols_to_exclude: list[str] | None = None,
+        when_matched_update: bool = True,
+        when_matched_deleted: bool = False,
+        when_not_matched_insert: bool = True,
+        use_partition_pruning: bool = True,
+        ignore_empty_df: bool = False,
+        create_if_not_exists: bool = True,
+        refresh_table: bool = True,
+        **_: Any,
+    ) -> PipelineContext:
+        """Merge the dataframe into the delta table.
+
+        Args:
+            context: Context in which this Action is executed.
+            table_identifier: The identifier of the table. If passed, the
+                UC Adapter will be used to create a table object. Otherwise the Table
+                object will be created from the table metadata in the context.
+            key_columns: List of column names that form the
+                key for the merge operation.
+            when_matched_update: Flag to specify whether to
+                perform an update operation whenmatching records are found in
+                the target Delta table.
+            when_matched_deleted: Flag to specify whether to
+                perform a delete operation when matching records are found in
+                the target Delta table.
+            when_not_matched_insert: Flag to specify whether to perform an
+                insert operation when matching records are not found in the target
+                Delta table.
+            cols_to_update: List of column names to be
+                updated in the target Delta table.
+            cols_to_insert: List of column names to be
+                inserted into the target Delta table.
+            cols_to_exclude: List of column names to be
+                excluded from the merge operation.
+            use_partition_pruning: Flag to specify whether to use partition
+                pruning to optimize the performance of the merge operation.
+            ignore_empty_df: A flag indicating whether to ignore an empty source dataframe.
+            create_if_not_exists: Create the table if it not exists.
+            refresh_table: Refresh the table after the transaction.
+
+        Raises:
+            ValueError: If the table does not exist.
+            ValueError: If the data is not set in the pipeline context.
+            ValueError: If the table metadata is empty.
+
+        Returns:
+            Pipeline Context
+        """
+        delta_merge_writer = DeltaMergeWriter()
+
+        if context.data is None:
+            raise ValueError("Data is required for the merge operation.")
+        if context.table_metadata is None and table_identifier is None:
+            raise ValueError("Table metadata or a table identifier are required for the merge operation.")
+
+        if table_identifier is not None:
+            context.table_metadata = UnityCatalogAdapter().get_table_by_name(table_identifier)
+        else:
+            if context.table_metadata is None:
+                raise ValueError("Table metadata is required.")
+
+        if context.table_metadata is None:
+            raise ValueError("Table metadata is required.")
+
+        if create_if_not_exists:
+            delta_merge_writer.table_manager.create_table(table=context.table_metadata, ignore_if_exists=True)
+
+        if not delta_merge_writer.table_manager.table_exists(context.table_metadata):
+            raise ValueError(f"Table {context.table_metadata.name} does not exist.")
+
+        assert key_columns is not None, "Key columns must be provided."
+
+        delta_merge_writer.write(
+            table_identifier=context.table_metadata.identifier,
+            storage_path=str(context.table_metadata.storage_path),
+            data_frame=context.data,
+            key_columns=key_columns,
+            cols_to_update=cols_to_update,
+            cols_to_insert=cols_to_insert,
+            cols_to_exclude=cols_to_exclude,
+            when_matched_update=when_matched_update,
+            when_matched_deleted=when_matched_deleted,
+            when_not_matched_insert=when_not_matched_insert,
+            use_partition_pruning=use_partition_pruning,
+            partition_by=context.table_metadata.partition_by,
+            ignore_empty_df=ignore_empty_df,
+        )
+
+        if refresh_table:
+            delta_merge_writer.table_manager.refresh_table(table_identifier=context.table_metadata.identifier)
+
+        return context.from_existing()
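
Similarly, a hypothetical direct invocation of the merge action (normally configured by name, `WRITE_DELTA_MERGE`, in a pipeline definition); column names and identifier are illustrative:

```python
# Illustrative sketch; `context` carries the source DataFrame and, optionally, table metadata.
from cloe_nessy.pipeline.actions import WriteDeltaMergeAction

action = WriteDeltaMergeAction()
new_context = action.run(
    context,
    table_identifier="catalog.schema.orders",
    key_columns=["order_id"],        # required: the action asserts this is not None
    cols_to_exclude=["_loaded_at"],  # example column kept out of the merge
    when_matched_update=True,
    when_not_matched_insert=True,
    create_if_not_exists=True,       # table is created from the metadata if missing
    refresh_table=True,              # issues REFRESH TABLE after the merge
)
```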
@@ -0,0 +1,94 @@
+from typing import Any
+
+from ...integration.writer import FileWriter
+from ...pipeline import PipelineAction, PipelineContext
+
+
+class WriteFileAction(PipelineAction):
+    """This class implements a Write action for an ETL pipeline.
+
+    The WriteFileAction writes a Dataframe to a storage location defined in the
+    options using the [`FileWriter`][cloe_nessy.integration.writer.FileWriter] class.
+
+    Example:
+        ```yaml
+        Write to File:
+            action: WRITE_FILE
+            options:
+                path: "path/to/location"
+                format: "parquet"
+                partition_cols: ["date"]
+                mode: "append"
+                is_stream: False
+                options:
+                    mergeSchema: "true"
+        ```
+    """
+
+    name: str = "WRITE_FILE"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        path: str = "",
+        format: str = "delta",
+        partition_cols: list[str] | None = None,
+        mode: str = "append",
+        is_stream: bool = False,
+        options: dict[str, str] | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Writes a file to a location.
+
+        Args:
+            context: Context in which this Action is executed.
+            path: Location to write data to.
+            format: Format of files to write.
+            partition_cols: Columns to partition on. If None, the writer will try to get the partition
+                columns from the metadata. Default None.
+            mode: Specifies the behavior when data or table already exists.
+            is_stream: If True, use the `write_stream` method of the writer.
+            options: Additional options passed to the writer.
+
+        Raises:
+            ValueError: If no path is provided.
+            ValueError: If the table metadata is empty.
+
+        Returns:
+            Pipeline Context
+        """
+        if not path:
+            raise ValueError("No path provided. Please specify path to write data to.")
+        if not options:
+            options = {}
+
+        if context.data is None:
+            raise ValueError("Data context is required for the operation.")
+
+        if partition_cols is None:
+            if context.table_metadata is None:
+                partition_cols = []
+            else:
+                partition_cols = context.table_metadata.partition_by
+        writer = FileWriter()
+        if not is_stream:
+            writer.write(
+                data_frame=context.data,
+                location=path,
+                format=format,
+                partition_cols=partition_cols,
+                mode=mode,
+                options=options,
+            )
+        else:
+            writer.write_stream(
+                data_frame=context.data,
+                location=path,
+                format=format,
+                mode=mode,
+                partition_cols=partition_cols,
+                options=options,
+            )
+
+        return context.from_existing()
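
For comparison with the YAML example embedded in the docstring above, a hypothetical direct Python call to the action (values mirror the YAML and are illustrative):

```python
# Illustrative sketch; `context.data` must hold the DataFrame to be written.
from cloe_nessy.pipeline.actions import WriteFileAction

action = WriteFileAction()
new_context = action.run(
    context,
    path="path/to/location",
    format="parquet",
    partition_cols=["date"],  # when None, partition columns fall back to the table metadata
    mode="append",
    is_stream=False,          # True would route the write through writer.write_stream
    options={"mergeSchema": "true"},
)
```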
@@ -1,9 +1,10 @@
 Metadata-Version: 2.4
 Name: cloe-nessy
-Version: 0.3.8
+Version: 0.3.10
 Summary: Your friendly datalake monster.
 Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
 License: MIT
+Project-URL: homepage, https://initions.com/
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Environment :: Console
 Classifier: License :: OSI Approved :: MIT License
@@ -28,13 +29,13 @@ Requires-Dist: databricks-sdk<1.0.0,>=0.36.0
 Requires-Dist: networkx<4.0,>=3.3
 Requires-Dist: matplotlib<4.0.0,>=3.9.2
 Requires-Dist: types-networkx<4.0.0.0,>=3.2.1.20240820
-Requires-Dist: fsspec<2025.
+Requires-Dist: fsspec<2025.6.0,>=2025.5.1
 Requires-Dist: cloe-logging[databricks,log-analytics]<0.4,>=0.3.7
 
 # cloe-nessy
 
 [](https://github.com/copier-org/copier)
-[](https://www.python.org)
 [](https://github.com/astral-sh/uv)
 [](https://github.com/charliermarsh/ruff)
 [](https://mypy-lang.org/)
@@ -43,4 +44,27 @@ Requires-Dist: cloe-logging[databricks,log-analytics]<0.4,>=0.3.7
 
 Owner: initions
 
-
+Nessy is diving deep into Data, unleashing insights with ease.
+
+## Introduction
+
+"Nessy" is a comprehensive Python datalake framework that provides a seamless,
+efficient, and user-friendly platform for managing and analyzing datasets using
+pyspark. Its advanced features allow operations on each level of an
+Extract-Transform-Load (ETL) Workflow.
+
+## Contributing
+
+When you are contributing, please refer to our Contribution Guide in the *nessy*
+Docs
+[here](https://white-rock-0cabbc003.1.azurestaticapps.net/tool_docs/nessy/Developer-Guide/)!
+
+## Usage
+
+Please find the User Guide
+[here](https://white-rock-0cabbc003.1.azurestaticapps.net/tool_docs/nessy/User-Guide/)!
+
+## Contact
+
+Please reach out to the *nessy* Team for any questions around this package and
+repository.
@@ -18,14 +18,22 @@ cloe_nessy/file_utilities/strategies/onelake_strategy.py,sha256=RnQjWtWIFzFj-zPq
 cloe_nessy/file_utilities/strategies/utils_strategy.py,sha256=urayKfOUpSaXKgTs1KVK0TS7FWVrJ3k4OLKh35sCxAU,3194
 cloe_nessy/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/integration/reader/__init__.py,sha256=J5vlORqHLBpHEvzIwfIjzN5xEdOat-8jlmdLcGj8nsA,239
-cloe_nessy/integration/reader/api_reader.py,sha256=
-cloe_nessy/integration/reader/catalog_reader.py,sha256=
-cloe_nessy/integration/reader/excel_reader.py,sha256=
+cloe_nessy/integration/reader/api_reader.py,sha256=3Mf-txOTJ1dXCzdNtRTLC8UKftKms4NxOoLVgzcc2eo,5691
+cloe_nessy/integration/reader/catalog_reader.py,sha256=lwDeWBVXfFh75XknPawetL9ZBtqS-Oss5rNzbrEeIQg,2070
+cloe_nessy/integration/reader/excel_reader.py,sha256=8KCqKBYFE6RGCiahJimQOAtbYZzaUzlnoslW9yca5P8,8035
 cloe_nessy/integration/reader/exceptions.py,sha256=_A9jFpe_RIDZCGY76qzjic9bsshxns6yXPSl141dq1c,203
-cloe_nessy/integration/reader/file_reader.py,sha256=
+cloe_nessy/integration/reader/file_reader.py,sha256=3DcZhyyL-Cf_R7Px1UDHJwpO8Un31dWey2Q-f4DtWfY,6879
 cloe_nessy/integration/reader/reader.py,sha256=e2KVPePQme8SBQJEbL-3zpGasOgTiEvKFTslow2wGPw,1034
-cloe_nessy/integration/writer/__init__.py,sha256=
+cloe_nessy/integration/writer/__init__.py,sha256=3yzCAGiWZdQWtsbzlTih01sxVTJV2DDYwvl34lEAUlE,243
 cloe_nessy/integration/writer/catalog_writer.py,sha256=Gb-hMdADgO_uUJ7mZPHBYyNme2qXsdFFnzwo7GcShHM,2192
+cloe_nessy/integration/writer/file_writer.py,sha256=SUDbN13ZzDhbM8DpOGFgM_Gkg70To4L6Q182pXx2HRM,5454
+cloe_nessy/integration/writer/writer.py,sha256=elFPLFrWR-qVE9qnBtzzzhyRALLQcRVuOsPS0rNmRt4,1741
+cloe_nessy/integration/writer/delta_writer/__init__.py,sha256=h2CT6Hllmk0nodlek27uqwniCzVZKMkYcPGyG9K2Z24,164
+cloe_nessy/integration/writer/delta_writer/delta_append_writer.py,sha256=TbpW-j87_H9dcUza34uR6VWslJez406y3_5N1ip0SnM,4740
+cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=no2GOLqMAJd0fEy2mqMevMj_CvutcJPRmXJC2tD4icA,10112
+cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py,sha256=kiacqQ2FYQSzakJqZ9-ZHH3os4X7--QuER_2xx9y21k,971
+cloe_nessy/integration/writer/delta_writer/delta_writer_base.py,sha256=upUtDZMzwYFU0kzmkelVgkpFToXkrypcR3h_jvGjz14,8596
+cloe_nessy/integration/writer/delta_writer/exceptions.py,sha256=xPmGiYV0xQXauln5Oh34E5vbm0rVcs6xCh-SJSb2bw0,107
 cloe_nessy/logging/__init__.py,sha256=ySVCVbdyR3Dno_tl2ZfiER_7EVaDoQMHVkNyfdMZumY,65
 cloe_nessy/logging/logger_mixin.py,sha256=9iy7BF6drYme-f7Rrt_imbVBRgVqQ89xjcP1X5aMtfY,7467
 cloe_nessy/models/__init__.py,sha256=-FmWEJ1Oq1njSopjc0R7GmT64mLSmALkm8PkHNzy9Y8,327
@@ -33,17 +41,17 @@ cloe_nessy/models/catalog.py,sha256=ayC1sMp4cNLAZtu0ICVV3Us6-o4hn8U9tpzzvxC9RAs,
 cloe_nessy/models/column.py,sha256=53fBwRnino72XKACsHZpN9QfCBqqSXyKLHZlM0huumg,1988
 cloe_nessy/models/constraint.py,sha256=hsFlhn4n928z81O3dl3v5bMetewPWzMjkJK3_4kASSM,178
 cloe_nessy/models/foreign_key.py,sha256=DwRVHs9sShqqPV-NL7ow_3AmPPWX0Od26yZn_I565pU,1001
-cloe_nessy/models/schema.py,sha256=
-cloe_nessy/models/table.py,sha256=
+cloe_nessy/models/schema.py,sha256=yUrjjEhAH5zbCymE67Az_jPnVB8hGO-_UNfqzeZCD_Y,3376
+cloe_nessy/models/table.py,sha256=1N79hc79uJbNw5tHuoQAhLLS6y-9TFx5LIQT-C1X-wU,12075
 cloe_nessy/models/types.py,sha256=XRbuJGdTNa6aXyE3IAzs_J9gVjbfkzMDLfGl-k6jI_4,223
-cloe_nessy/models/volume.py,sha256=
+cloe_nessy/models/volume.py,sha256=51BE06FrL1Wv6zblFwJ_HTiR6WQqH7pSmrdH90rqwLg,2444
 cloe_nessy/models/adapter/__init__.py,sha256=m36W_mqwB3dCYnCIt0fLOSHS4E1VU8FRGoaum4Gf95o,90
-cloe_nessy/models/adapter/unity_catalog_adapter.py,sha256=
+cloe_nessy/models/adapter/unity_catalog_adapter.py,sha256=a-14Ys-AevVYQd0xeJU1syLxjT5Wzo4uog1hFSEs76M,12651
 cloe_nessy/models/mixins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/models/mixins/read_instance_mixin.py,sha256=j5Y4aNWOh1jlskEaxNooZFJgPyxRmik00gAVLJnAaRs,4507
 cloe_nessy/models/mixins/template_loader_mixin.py,sha256=5MXhEGBFlq3dwZvINEyBowSlipNnVun2H_TmhI_fsS4,549
 cloe_nessy/object_manager/__init__.py,sha256=3sle0vNpPwBOkycxA3XVS9m4XZf5LD3Qd4NGxdqcHno,186
-cloe_nessy/object_manager/table_manager.py,sha256=
+cloe_nessy/object_manager/table_manager.py,sha256=oYcYiZR0-JyoadcCcDelxfFb-ATeKDIZerYaZc-moiI,12634
 cloe_nessy/object_manager/volume_manager.py,sha256=6epd3KXzcNH04EvaKubAfLsaUm9qBMeT3KNvMK04gGs,2727
 cloe_nessy/pipeline/__init__.py,sha256=sespmJ5JsgyiFyZiedTiL2kg--zGIX7cjTYsD5vemEg,325
 cloe_nessy/pipeline/pipeline.py,sha256=-1tJVs9rZf8CcwieH4IP7mqJZ6mL7bQUZ56TNKt8eO8,11154
@@ -52,9 +60,9 @@ cloe_nessy/pipeline/pipeline_config.py,sha256=BN3ZSbr6bC-X9edoh-n5vRfPHFMbgtAU7m
 cloe_nessy/pipeline/pipeline_context.py,sha256=csElDc6BsynDUtRXgQOSCH7ONc_b-ag0YEg0zlQTz58,1874
 cloe_nessy/pipeline/pipeline_parsing_service.py,sha256=c_nAsgw81QYBM9AFiTxGgqRhNXABkDKplbeoCJPtbpE,6434
 cloe_nessy/pipeline/pipeline_step.py,sha256=UlnmpS6gm_dZ7m9dD1mZvye7mvUF_DA7HjOZo0oGYDU,1977
-cloe_nessy/pipeline/actions/__init__.py,sha256=
+cloe_nessy/pipeline/actions/__init__.py,sha256=9gjSQKLGrPcaYaJrTYZde8d4yNrN1SoXN_DDHq5KrvY,2600
 cloe_nessy/pipeline/actions/read_api.py,sha256=RBv5XeHtjTXuCP09Fqo6JNx6iIhQQI-nuAHCuSaGs2s,7778
-cloe_nessy/pipeline/actions/read_catalog_table.py,sha256
+cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=oXbqbc6BfR82dSIGclwzWiTN8EVmpFjNIYLKm4qOU50,2754
 cloe_nessy/pipeline/actions/read_excel.py,sha256=Mhl3r_2Hqk2XN7Fl5WqqAyE4JdnwSiivbhWMglyBtkE,7961
 cloe_nessy/pipeline/actions/read_files.py,sha256=N9bFgtG1tovhp2JayxE5YiN9PiO2lgG2-6h_Y6tD2eU,5220
 cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=3ZDy9qiDYtM1oDQzHPC23hLOvHjhdk5zg1wVHE60m9k,2295
@@ -75,13 +83,16 @@ cloe_nessy/pipeline/actions/transform_replace_values.py,sha256=1OPHTrjcphfyGepcO
 cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=-GhSEsb7iNnZIsYRm3BG9BX4_qUDJMbpj1DsKPY046w,4574
 cloe_nessy/pipeline/actions/transform_union.py,sha256=s81Vge0AbYPc7VkskCYfOQ_LEjqcmfNFyDkytfjcZyo,2720
 cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=j7gRuG3Fedh8JgevIFBbHKock3laJVq4l6Mx3CGU5eo,2676
+cloe_nessy/pipeline/actions/write_delta_append.py,sha256=fuL29SK9G5K14ycckU3iPexeK0XNXUfQscCwhXHxbKA,2498
+cloe_nessy/pipeline/actions/write_delta_merge.py,sha256=Hir7QZZZJ9hmQZXiJ9iz6u06OCmcHFpyKFVB_I1saSM,5043
+cloe_nessy/pipeline/actions/write_file.py,sha256=H8LRst045yij-8XJ5pRB9m5d1lZpZjFa0WSVdSFesPo,2984
 cloe_nessy/session/__init__.py,sha256=t7_YjUhJYW3km_FrucaUdbIl1boQtwkyhw_8yE10qzc,74
 cloe_nessy/session/session_manager.py,sha256=f4OeeyGD3becDQGkdDbck3jVH9ulOCBWjW6Jaj_MIrc,7765
 cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEvzM,101
 cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
 cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
-cloe_nessy-0.3.
-cloe_nessy-0.3.
-cloe_nessy-0.3.
-cloe_nessy-0.3.
+cloe_nessy-0.3.10.dist-info/METADATA,sha256=NBGGYODGPrVIhK3HBYkRSCUkd3tvBnU0AsYqB2j90Js,3162
+cloe_nessy-0.3.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cloe_nessy-0.3.10.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
+cloe_nessy-0.3.10.dist-info/RECORD,,
{cloe_nessy-0.3.8.dist-info → cloe_nessy-0.3.10.dist-info}/top_level.txt: File without changes