cloe-nessy 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- cloe_nessy/integration/reader/api_reader.py +4 -2
- cloe_nessy/integration/reader/catalog_reader.py +6 -3
- cloe_nessy/integration/reader/excel_reader.py +1 -1
- cloe_nessy/integration/reader/file_reader.py +78 -5
- cloe_nessy/integration/writer/__init__.py +8 -1
- cloe_nessy/integration/writer/delta_writer/__init__.py +7 -0
- cloe_nessy/integration/writer/delta_writer/delta_append_writer.py +108 -0
- cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +215 -0
- cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +21 -0
- cloe_nessy/integration/writer/delta_writer/delta_writer_base.py +210 -0
- cloe_nessy/integration/writer/delta_writer/exceptions.py +4 -0
- cloe_nessy/integration/writer/file_writer.py +132 -0
- cloe_nessy/integration/writer/writer.py +54 -0
- cloe_nessy/models/adapter/unity_catalog_adapter.py +5 -1
- cloe_nessy/models/schema.py +1 -1
- cloe_nessy/models/table.py +17 -6
- cloe_nessy/object_manager/table_manager.py +73 -19
- cloe_nessy/pipeline/actions/__init__.py +7 -1
- cloe_nessy/pipeline/actions/read_catalog_table.py +1 -4
- cloe_nessy/pipeline/actions/write_delta_append.py +69 -0
- cloe_nessy/pipeline/actions/write_delta_merge.py +118 -0
- cloe_nessy/pipeline/actions/write_file.py +94 -0
- {cloe_nessy-0.3.8.dist-info → cloe_nessy-0.3.9.dist-info}/METADATA +28 -4
- {cloe_nessy-0.3.8.dist-info → cloe_nessy-0.3.9.dist-info}/RECORD +26 -15
- {cloe_nessy-0.3.8.dist-info → cloe_nessy-0.3.9.dist-info}/WHEEL +1 -1
- {cloe_nessy-0.3.8.dist-info → cloe_nessy-0.3.9.dist-info}/top_level.txt +0 -0
cloe_nessy/integration/reader/api_reader.py

```diff
@@ -39,6 +39,7 @@ class APIReader(BaseReader):
 
     def read(
         self,
+        *,
         endpoint: str = "",
         method: str = "GET",
         key: str | None = None,
@@ -66,7 +67,7 @@ class APIReader(BaseReader):
             max_retries: The maximum number of retries for the request.
             options: Additional options for the createDataFrame function.
             add_metadata_column: If set, adds a __metadata column containing metadata about the API response.
-            kwargs:
+            **kwargs: Additional keyword arguments to maintain compatibility with the base class method.
 
         Returns:
             DataFrame: The Spark DataFrame containing the read data in the json_object column.
@@ -74,7 +75,8 @@ class APIReader(BaseReader):
         Raises:
            RuntimeError: If there is an error with the API request or reading the data.
        """
-
+        if options is None:
+            options = {}
        try:
            response = self.api_client.request(
                method=method,
```
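With the added `*,`, the remaining `read` arguments become keyword-only. A minimal call sketch (the import path and the reader's construction are assumptions, not shown in this diff):

```python
# Hypothetical usage sketch; APIReader's constructor and API client setup are
# not part of this diff, and the import path is assumed.
from cloe_nessy.integration.reader import APIReader  # import path assumed

reader = APIReader()  # assumed to be configured with an api_client elsewhere
df = reader.read(
    endpoint="users",          # arguments must now be passed by name
    method="GET",
    add_metadata_column=True,  # adds the __metadata column described in the docstring
    # options may be omitted; it now defaults to {} inside read()
)
```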
cloe_nessy/integration/reader/catalog_reader.py

```diff
@@ -17,12 +17,13 @@ class CatalogReader(BaseReader):
         """Initializes the CatalogReader object."""
         super().__init__()
 
-    def read(self, table_identifier: str = "", **kwargs: Any) -> DataFrame:
+    def read(self, table_identifier: str = "", *, options: dict[str, str] | None = None, **kwargs: Any) -> DataFrame:
         """Reads a table from the Unity Catalog.
 
         Args:
             table_identifier: The table identifier in the Unity Catalog in the format 'catalog.schema.table'.
-
+            options: PySpark options for the read table operation (not used in the current implementation).
+            **kwargs: Additional keyword arguments to maintain compatibility with the base class method.
 
         Returns:
             The Spark DataFrame containing the read data.
@@ -31,6 +32,8 @@ class CatalogReader(BaseReader):
             ValueError: If the table_identifier is not provided, is not a string, or is not in the correct format.
             Exception: For any other unexpected errors.
         """
+        if options is None:
+            options = {}
         if not table_identifier:
             raise ValueError("table_identifier is required")
         if not isinstance(table_identifier, str):
@@ -39,7 +42,7 @@ class CatalogReader(BaseReader):
             raise ValueError("table_identifier must be in the format 'catalog.schema.table'")
 
         try:
-            df = self._spark.read.table(table_identifier)
+            df = self._spark.read.table(table_identifier, **options)
             return df
         except AnalysisException as err:
             raise ValueError(f"Table not found: {table_identifier}") from err
```
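The new `options` parameter is keyword-only, defaults to an empty dict, and is unpacked into `spark.read.table`. A short sketch (import path and table name are assumptions):

```python
# Hypothetical usage sketch; the import path and catalog/schema/table names are assumptions.
from cloe_nessy.integration.reader import CatalogReader  # import path assumed

reader = CatalogReader()
df = reader.read(
    "my_catalog.my_schema.my_table",  # must be 'catalog.schema.table'
    options={},                       # keyword-only; defaults to {} when omitted
)
df.show()
```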
cloe_nessy/integration/reader/excel_reader.py

```diff
@@ -92,7 +92,7 @@ class ExcelDataFrameReader(BaseReader):
                 pyspark.pandas.read_excel and handed to TextFileReader.
             load_as_strings: If True, converts all columns to string type to avoid datatype conversion errors in Spark.
             add_metadata_column: If True, adds a metadata column containing the file location and sheet name.
-            kwargs:
+            **kwargs: Additional keyword arguments to maintain compatibility with the base class method.
         """
         if options is None:
             options = {}
```
cloe_nessy/integration/reader/file_reader.py

```diff
@@ -1,7 +1,9 @@
 from typing import Any
 
 import pyspark.sql.functions as F
-from pyspark.sql import DataFrame
+from pyspark.sql import DataFrame, DataFrameReader
+from pyspark.sql.streaming import DataStreamReader
+from pyspark.sql.types import StructType
 
 from ...file_utilities import get_file_paths
 from .reader import BaseReader
@@ -17,9 +19,18 @@ class FileReader(BaseReader):
         """Initializes the FileReader object."""
         super().__init__()
 
+    def _get_reader(self) -> DataFrameReader:
+        """Returns a DataFrameReader."""
+        return self._spark.read
+
+    def _get_stream_reader(self) -> DataStreamReader:
+        """Returns a DataFrameReader."""
+        return self._spark.readStream
+
     def read(
         self,
         location: str,
+        *,
         spark_format: str | None = None,
         extension: str | None = None,
         schema: str | None = None,
@@ -38,7 +49,22 @@ class FileReader(BaseReader):
             search_subdirs: Whether to include files in subdirectories.
             options: Spark DataFrame reader options.
             add_metadata_column: Whether to include __metadata column in the DataFrame.
-            kwargs:
+            **kwargs: Additional keyword arguments to maintain compatibility with the base class method.
+
+        Raises:
+            ValueError: If neither spark_format nor extension is provided.
+            ValueError: If the provided extension is not supported.
+            Exception: If there is an error while reading the files.
+
+        Note:
+            - The `spark_format` parameter is used to specify the format of the files to be read.
+            - If `spark_format` is not provided, the method will try to infer it from the `extension`.
+            - The `extension` parameter is used to specify the file extension (e.g., 'csv', 'json', etc.).
+            - If both `spark_format` and `extension` are provided, `spark_format` will take precedence.
+            - The method will raise an error if neither `spark_format` nor `extension` is provided.
+
+        Returns:
+            A DataFrame containing the data from the files.
         """
         if options is None:
             options = {}
@@ -67,7 +93,7 @@ class FileReader(BaseReader):
         self._console_logger.debug(f"File paths: {file_paths}")
         assert spark_format is not None
 
-        reader = self.
+        reader = self._get_reader().format(spark_format)
         if schema:
             reader.schema(schema)
         else:
@@ -78,7 +104,7 @@ class FileReader(BaseReader):
 
         try:
             self._console_logger.debug("Loading files into DataFrame")
-            df = reader.load(file_paths)
+            df = reader.load([str(p) for p in file_paths])
             self._console_logger.debug("Successfully loaded files into DataFrame")
             if add_metadata_column:
                 df = self._add_metadata_column(df)
@@ -89,9 +115,56 @@ class FileReader(BaseReader):
         self._console_logger.info(f"Successfully read files from [ '{location}' ]")
         return df
 
+    def read_stream(
+        self,
+        location: str = "",
+        schema: StructType | str | None = None,
+        format: str = "delta",
+        add_metadata_column: bool = False,
+        options: dict[str, Any] | None = None,
+        **_: Any,
+    ) -> DataFrame:
+        """Reads specified location as a stream and returns streaming DataFrame.
+
+        Arguments:
+            location : Location of files to read.
+            format: Format of files to read.
+            schema: Schema of the file.
+            add_metadata_column: Whether to include __metadata column in the DataFrame.
+            options: Spark DataFrame reader options.
+
+        Raises:
+            ValueError: If location is not provided.
+
+        Returns:
+            A Streaming DataFrame
+        """
+        if not location:
+            raise ValueError("Location is required for streaming.")
+        self._console_logger.debug(f"Reading files from [ '{location}' ] ...")
+        try:
+            if options is None:
+                options = {}
+            reader = self._get_stream_reader()
+            reader.format(format)
+            reader.option("rescuedDataColumn", "_rescued_data")
+            if schema is None:
+                options["inferSchema"] = True
+            else:
+                reader.schema(schema)
+            reader.options(**options)
+            df = reader.load(location)
+            if add_metadata_column:
+                df = self._add_metadata_column(df)
+        except Exception as e:
+            self._console_logger.error(f"Failed to read files from [ '{location}' ]: {e}")
+            raise
+        else:
+            self._console_logger.info(f"Successfully read files from [ '{location}' ]")
+            return df
+
     def _add_metadata_column(self, df: DataFrame) -> DataFrame:
         """Add all metadata columns to the DataFrame."""
-        # Extract metadata fields into separate columns
         metadata_columns = df.select("_metadata.*").columns
 
         entries = [(F.lit(field), F.col(f"_metadata.{field}")) for field in metadata_columns]
```
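The new `read_stream` method only builds and returns a streaming DataFrame; the caller still has to attach a sink. A minimal consumption sketch (import path, locations, schema, and options are assumptions, not values from this diff):

```python
# Hypothetical usage sketch; paths, schema, and options are placeholders.
from cloe_nessy.integration.reader import FileReader  # import path assumed

reader = FileReader()
stream_df = reader.read_stream(
    location="/mnt/landing/events/",       # example source path
    format="json",                         # defaults to "delta" when omitted
    schema="id LONG, payload STRING",      # DDL string or StructType; omit to infer
    add_metadata_column=True,
    options={"maxFilesPerTrigger": "100"}, # standard file-source option
)

# The returned DataFrame is streaming, so it needs a writeStream sink:
query = (
    stream_df.writeStream
    .format("delta")
    .option("checkpointLocation", "/mnt/checkpoints/events/")  # example path
    .start("/mnt/bronze/events/")                              # example target path
)
```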
cloe_nessy/integration/writer/__init__.py

```diff
@@ -1,3 +1,10 @@
 from .catalog_writer import CatalogWriter
+from .delta_writer import DeltaAppendWriter, DeltaMergeWriter
+from .file_writer import FileWriter
 
-__all__ = [
+__all__ = [
+    "CatalogWriter",
+    "DeltaAppendWriter",
+    "DeltaMergeWriter",
+    "FileWriter",
+]
```
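With the expanded `__all__`, the new writers are re-exported from the writer package itself (package layout as in the file summary above):

```python
# Import sketch based on the updated __init__.py shown above.
from cloe_nessy.integration.writer import (
    CatalogWriter,
    DeltaAppendWriter,
    DeltaMergeWriter,
    FileWriter,
)
```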
cloe_nessy/integration/writer/delta_writer/delta_append_writer.py (new file)

```diff
@@ -0,0 +1,108 @@
+from pyspark.sql import DataFrame
+
+from ....object_manager import table_log_decorator
+from ....session import SessionManager
+from ..file_writer import FileWriter
+from .delta_table_operation_type import DeltaTableOperationType
+from .delta_writer_base import BaseDeltaWriter
+
+
+class DeltaAppendWriter(BaseDeltaWriter):
+    """A class for appending DataFrames to Delta tables."""
+
+    def __init__(self):
+        super().__init__()
+        self._spark = SessionManager.get_spark_session()
+        self._dbutils = SessionManager.get_utils()
+
+    @table_log_decorator(operation="append")
+    def write(
+        self,
+        table_identifier: str,
+        table_location: str,
+        data_frame: DataFrame,
+        ignore_empty_df: bool = False,
+        options: dict[str, str] | None = None,
+    ):
+        """Appends the provided DataFrame to a Delta table.
+
+        Args:
+            table_identifier: The identifier of the Delta table in the format 'catalog.schema.table'.
+            table_location: The location of the Delta table.
+            data_frame: The DataFrame to append to the table.
+            ignore_empty_df: If True, the function returns early without
+                doing anything if the DataFrame is empty.
+            options: Additional keyword arguments that will be passed to the 'write' method of the
+                FileDataFrameWriter instance. These can be any parameters accepted by the 'write'
+                method, which could include options for configuring the write operation, such as
+                'checkpointLocation' for specifying the path where checkpoints will be stored, or
+                'path' for specifying the path where the output data will be written.
+        """
+        if self._empty_dataframe_check(data_frame, ignore_empty_df):
+            return
+        writer = FileWriter()
+        writer.write(
+            data_frame=data_frame,
+            location=table_location,
+            format="DELTA",
+            mode="APPEND",
+            options=options,
+        )
+        self._report_delta_table_operation_metrics(
+            table_identifier=table_identifier, operation_type=DeltaTableOperationType.WRITE
+        )
+
+    @table_log_decorator(operation="stream_append")
+    def write_stream(
+        self,
+        table_identifier: str,
+        table_location: str,
+        data_frame: DataFrame,
+        checkpoint_location: str | None = None,
+        trigger_dict: dict | None = None,
+        options: dict[str, str] | None = None,
+        await_termination: bool = False,
+    ):
+        """Appends the provided DataFrame to a Delta table.
+
+        Args:
+            table_identifier: The identifier of the Delta table in the format 'catalog.schema.table'.
+            table_location: The location of the Delta table.
+            data_frame: The DataFrame to append to the table.
+            checkpoint_location: Location of checkpoint. If None, defaults
+                to the location of the table being written, with '_checkpoint_'
+                added before name. Default None.
+            trigger_dict: A dictionary specifying the trigger configuration for the streaming query.
+                Supported keys include:
+
+                - "processingTime": Specifies a time interval (e.g., "10 seconds") for micro-batch processing.
+                - "once": Processes all available data once and then stops.
+                - "continuous": Specifies a time interval (e.g., "1 second") for continuous processing.
+                - "availableNow": Processes all available data immediately and then stops.
+
+                If nothing is provided, the default is {"availableNow": True}.
+            options: Additional keyword arguments that will be passed to the
+                'write' method of the FileDataFrameWriter instance. These can be
+                any parameters accepted by the 'write' method, which could
+                include options for configuring the write operation.
+            await_termination: If True, the function will wait for the streaming
+                query to finish before returning. This is useful for ensuring that
+                the data has been fully written before proceeding with other
+                operations.
+
+        Returns:
+            None.
+        """
+        writer = FileWriter()
+        writer.write_stream(
+            data_frame=data_frame,
+            location=table_location,
+            format="DELTA",
+            checkpoint_location=checkpoint_location,
+            mode="APPEND",
+            trigger_dict=trigger_dict,
+            options=options,
+        )
+        self._report_delta_table_operation_metrics(
+            table_identifier=table_identifier, operation_type=DeltaTableOperationType.WRITE
+        )
```
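A hedged usage sketch of the new append writer. Table names, paths, and the example DataFrames are placeholders; `write` delegates to `FileWriter.write` with `format="DELTA"` and `mode="APPEND"`, and `write_stream` delegates to `FileWriter.write_stream`:

```python
# Hypothetical usage sketch; identifiers, locations, and sample data are assumptions.
from pyspark.sql import SparkSession

from cloe_nessy.integration.writer import DeltaAppendWriter

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], "id LONG, value STRING")

writer = DeltaAppendWriter()

# Batch append to an existing Delta table location.
writer.write(
    table_identifier="my_catalog.my_schema.events",  # example identifier
    table_location="/mnt/bronze/events",             # example location
    data_frame=df,
    ignore_empty_df=True,              # return early instead of failing on an empty DataFrame
    options={"mergeSchema": "true"},   # example Delta writer option
)

# Streaming append; trigger defaults to {"availableNow": True} when trigger_dict is omitted.
stream_df = spark.readStream.format("rate").option("rowsPerSecond", "5").load()  # example stream
writer.write_stream(
    table_identifier="my_catalog.my_schema.events",
    table_location="/mnt/bronze/events",
    data_frame=stream_df,
    checkpoint_location="/mnt/checkpoints/events",   # example path
    await_termination=True,
)
```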
cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py (new file)

```diff
@@ -0,0 +1,215 @@
+from typing import Any, Self
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pyspark.sql import DataFrame
+
+from ....models import Table
+from ....object_manager import table_log_decorator
+from ....session import SessionManager
+from .delta_table_operation_type import DeltaTableOperationType
+from .delta_writer_base import BaseDeltaWriter
+
+
+class DeltaMergeConfig(BaseModel):
+    """Configuration for Merge options.
+
+    Args:
+        dataframe_columns: The columns of the DataFrame.
+        key_columns: List of column names that form the key for the merge
+            operation.
+        when_matched_update: Flag to specify whether to perform an update
+            operation when matching records are found in the target Delta table.
+        when_matched_delete: Flag to specify whether to perform a delete
+            operation when matching records are found in the target Delta table.
+        when_not_matched_insert: Flag to specify whether to perform an insert
+            operation when matching records are not found in the target Delta
+            table.
+        cols_to_exclude_from_update: List of column names to be excluded from
+            the update in the target Delta table.
+        use_partition_pruning: Flag to specify whether to use partition
+            pruning to optimize the performance of the merge operation.
+        partition_by: List of column names to partition by.
+    """
+
+    dataframe_columns: list[str]
+    key_columns: list[str]
+    cols_to_exclude_from_update: list[str] = Field(default_factory=list)
+    when_matched_update: bool = True
+    when_matched_delete: bool = False
+    when_not_matched_insert: bool = True
+    use_partition_pruning: bool = True
+    partition_by: list[str] = Field(default_factory=list)
+    cols_to_merge: list[str] = Field(default_factory=list, alias="_cols_to_merge")
+    cols_to_update: set[str] = Field(default_factory=set, alias="_cols_to_update")
+    cols_to_insert: set[str] = Field(default_factory=set, alias="_cols_to_insert")
+    final_cols_to_update: dict[str, str] = Field(default_factory=dict)
+    final_cols_to_insert: dict[str, str] = Field(default_factory=dict)
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    @model_validator(mode="before")
+    @classmethod
+    def _validate_update_delete(cls, config: Any):
+        """Update and delete operations must be mutually exclusive."""
+        if config.get("when_matched_update") and config.get("when_matched_delete"):
+            raise ValueError("Update and delete operations cannot be used together.")
+        return config
+
+    @model_validator(mode="before")
+    @classmethod
+    def _validate_key_columns(cls, config: Any):
+        """Key columns must exist in the data frame."""
+        key_columns = config.get("key_columns")
+        dataframe_columns = config.get("dataframe_columns")
+        if not set(key_columns).issubset(set(dataframe_columns)):
+            raise ValueError("Key columns must exist in the DataFrame columns.")
+        return config
+
+    @model_validator(mode="before")
+    @classmethod
+    def _derive_merge_columns(cls, config: Any):
+        """Derive update and insert columns from the DataFrame columns."""
+        dataframe_columns = config.get("dataframe_columns", [])
+        config["_cols_to_merge"] = list(set(dataframe_columns))
+        if config.get("cols_to_exclude_from_update"):
+            config["_cols_to_update"] = set(config["_cols_to_merge"]) - set(config["cols_to_exclude_from_update"])
+        else:
+            config["_cols_to_update"] = set(config["_cols_to_merge"])
+
+        config["_cols_to_insert"] = config["_cols_to_merge"]
+        config["final_cols_to_update"] = {col: f"source.{col}" for col in config["_cols_to_update"]}
+        config["final_cols_to_insert"] = {col: f"source.{col}" for col in config["_cols_to_insert"]}
+        return config
+
+    @model_validator(mode="after")
+    @classmethod
+    def _validate_partition_pruning(cls, config: Self):
+        """If partition_pruning is set, the partition by columns must be known."""
+        if config.use_partition_pruning is True and not config.partition_by:
+            raise ValueError("Partition columns must be specified when using partition pruning.")
+        return config
+
+    @model_validator(mode="after")
+    @classmethod
+    def _validate_cols_exist(cls, config: Any):
+        """If partition_pruning is set, the partition by columns must be known."""
+        if any(col not in config.cols_to_merge for col in config.cols_to_update) or any(
+            col not in config.cols_to_merge for col in config.cols_to_insert
+        ):
+            raise ValueError(
+                "You specified column names for UPDATE or INSERT that either don't exist in the dataframe "
+                "or are explicitly excluded from the MERGE.",
+            )
+        return config
+
+
+class DeltaMergeWriter(BaseDeltaWriter):
+    """A class for merging DataFrames to Delta tables."""
+
+    def __init__(self):
+        super().__init__()
+        self._spark = SessionManager.get_spark_session()
+        self._dbutils = SessionManager.get_utils()
+
+    def _validate_table_inputs(
+        self, table: Table | None, table_identifier: str | None, storage_path: str | None
+    ) -> tuple[str, str]:
+        """Validates and retrieves table identifier and storage path."""
+        if table is None and (table_identifier is None or storage_path is None):
+            raise ValueError("Either a Table object or table_identifier and storage_path must be provided.")
+        if table is not None:
+            table_identifier = table.identifier
+            storage_path = str(table.storage_path)
+        if not storage_path:
+            raise ValueError("Storage path must be provided or extracted from the Table object.")
+        assert table_identifier is not None, "Table identifier must be provided."
+        return table_identifier, storage_path
+
+    def _build_match_conditions(self, data_frame: DataFrame, config: DeltaMergeConfig) -> str:
+        """Builds match conditions for the Delta table merge."""
+        match_conditions = self._merge_match_conditions(config.key_columns)
+        if config.use_partition_pruning:
+            match_conditions_list = [match_conditions] + [
+                self._partition_pruning_conditions(data_frame, config.partition_by),
+            ]
+            match_conditions = " AND ".join(match_conditions_list)
+        return match_conditions
+
+    def _build_merge_operations(
+        self, delta_table, data_frame: DataFrame, config: DeltaMergeConfig, match_conditions: str
+    ):
+        """Builds the Delta table merge operations."""
+        delta_table_merge = delta_table.alias("target").merge(
+            source=data_frame.alias("source"),
+            condition=match_conditions,
+        )
+        if config.when_matched_update:
+            delta_table_merge = delta_table_merge.whenMatchedUpdate(set=config.final_cols_to_update)
+        elif config.when_matched_delete:
+            delta_table_merge = delta_table_merge.whenMatchedDelete()
+        if config.when_not_matched_insert:
+            delta_table_merge = delta_table_merge.whenNotMatchedInsert(values=config.final_cols_to_insert)
+        return delta_table_merge
+
+    @table_log_decorator(operation="merge")
+    def write(
+        self,
+        data_frame: DataFrame,
+        table: Table | None = None,
+        table_identifier: str | None = None,
+        storage_path: str | None = None,
+        ignore_empty_df: bool = False,
+        **kwargs: Any,
+    ):
+        """Merges the data in a spark DataFrame into a Delta table.
+
+        This function performs a merge operation between a DataFrame and a Delta
+        table. The function supports update, delete, and insert operations on
+        the target Delta table based on conditions specified by the user. The
+        function also supports partition pruning to optimize the performance of
+        the merge operation.
+
+        Args:
+            table: The Table object representing the Delta table.
+            table_identifier: The identifier of the Delta table in the format
+                'catalog.schema.table'.
+            storage_path: The location of the Delta table.
+            data_frame: The DataFrame to be merged into the Delta table.
+            ignore_empty_df: A flag indicating whether to ignore an empty source
+                dataframe.
+            kwargs: Passed to the
+                [`DeltaMergeConfig`][cloe_nessy.integration.writer.delta_merge_writer.DeltaMergeConfig].
+
+        Raises:
+            ValueError: If both, table and table_identifier or storage_path are provided.
+            EmptyDataframeException: If the source dataframe is empty and
+                ignore_empty_df is False.
+            ValueError: If the specified columns for update or insert do not
+                exist in the DataFrame or are explicitly excluded from the
+                merge operation.
+            ValueError: If partition columns are not specified when using
+                partition pruning.
+        """
+        if self._empty_dataframe_check(data_frame, ignore_empty_df):
+            return
+        table_identifier, storage_path = self._validate_table_inputs(table, table_identifier, storage_path)
+
+        config = DeltaMergeConfig(dataframe_columns=data_frame.columns, **kwargs)
+
+        delta_table = self.table_manager.get_delta_table(location=storage_path, spark=data_frame.sparkSession)
+
+        match_conditions = self._build_match_conditions(data_frame, config)
+
+        delta_table_merge = self._build_merge_operations(delta_table, data_frame, config, match_conditions)
+        delta_table_merge.execute()
+        self._report_delta_table_operation_metrics(
+            table_identifier,
+            operation_type=DeltaTableOperationType.MERGE,
+        )
+
+    @table_log_decorator(operation="stream_merge")
+    def write_stream(self):
+        """Not implemented yet. See docs for more details."""
+        raise NotImplementedError(
+            "Streaming merge is not implemented yet. Please use the `write` method for batch merges."
+        )
```
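A hedged usage sketch of the merge writer. The keyword arguments after the DataFrame are forwarded to `DeltaMergeConfig`, whose validators enforce that key columns exist in the DataFrame, that update and delete are not combined, and that `partition_by` is set whenever partition pruning is enabled. Identifiers, paths, and sample data below are placeholders:

```python
# Hypothetical usage sketch; table identity, location, and sample data are assumptions.
from pyspark.sql import SparkSession

from cloe_nessy.integration.writer import DeltaMergeWriter

spark = SparkSession.builder.getOrCreate()
updates_df = spark.createDataFrame(
    [(1, "DE", "Alice"), (2, "FR", "Bob")],
    "customer_id LONG, country STRING, name STRING",
)

writer = DeltaMergeWriter()
writer.write(
    data_frame=updates_df,
    table_identifier="my_catalog.my_schema.customers",  # example identifier
    storage_path="/mnt/silver/customers",               # example location of an existing Delta table
    # Remaining keyword arguments are validated by DeltaMergeConfig:
    key_columns=["customer_id"],
    cols_to_exclude_from_update=["customer_id"],  # do not overwrite the key on update
    use_partition_pruning=True,
    partition_by=["country"],                     # required whenever use_partition_pruning is True
)
```

Alternatively, a `Table` object from `cloe_nessy.models` can be passed via the `table` argument instead of `table_identifier` and `storage_path`.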
cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py (new file)

```diff
@@ -0,0 +1,21 @@
+from enum import Enum
+
+
+class DeltaTableOperationType(Enum):
+    """Mapping between Delta table operation types and their operation metric keys available in the Delta table history.
+
+    Values of metric keys included in this mapping are reported using the
+    logging capabilities of the Delta operations of the DeltaManager.
+
+    See https://docs.databricks.com/delta/history.html for a complete list and
+    description of available metrics for each operation type.
+    """
+
+    UPDATE = ["numUpdatedRows"]
+    DELETE = ["numDeletedRows", "numRemovedFiles"]
+    MERGE = ["numSourceRows", "numTargetRowsInserted", "numTargetRowsUpdated", "numTargetRowsDeleted", "numOutputRows"]
+    WRITE = ["numOutputRows"]
+    TRUNCATE = ["numRemovedFiles"]
+    OPTIMIZE = ["numAddedFiles", "numRemovedFiles", "minFileSize", "p50FileSize", "maxFileSize"]
+    VACUUM = ["numDeletedFiles"]
+    STREAMING_UPDATE = ["numRemovedFiles", "numOutputRows", "numOutputBytes", "numAddedFiles"]
```