cloe-nessy 0.3.5__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. cloe_nessy/__init__.py +0 -0
  2. cloe_nessy/clients/__init__.py +0 -0
  3. cloe_nessy/clients/api_client/__init__.py +0 -0
  4. cloe_nessy/clients/api_client/api_client.py +0 -0
  5. cloe_nessy/clients/api_client/api_response.py +0 -0
  6. cloe_nessy/clients/api_client/auth.py +0 -0
  7. cloe_nessy/clients/api_client/exceptions.py +0 -0
  8. cloe_nessy/file_utilities/__init__.py +0 -0
  9. cloe_nessy/file_utilities/exceptions.py +0 -0
  10. cloe_nessy/file_utilities/factory.py +0 -0
  11. cloe_nessy/file_utilities/get_file_paths.py +0 -0
  12. cloe_nessy/file_utilities/location_types.py +0 -0
  13. cloe_nessy/file_utilities/strategies/__init__.py +0 -0
  14. cloe_nessy/file_utilities/strategies/base_strategy.py +0 -0
  15. cloe_nessy/file_utilities/strategies/local_strategy.py +0 -0
  16. cloe_nessy/file_utilities/strategies/onelake_strategy.py +0 -0
  17. cloe_nessy/file_utilities/strategies/utils_strategy.py +0 -0
  18. cloe_nessy/integration/__init__.py +0 -0
  19. cloe_nessy/integration/reader/__init__.py +0 -0
  20. cloe_nessy/integration/reader/api_reader.py +4 -2
  21. cloe_nessy/integration/reader/catalog_reader.py +6 -3
  22. cloe_nessy/integration/reader/excel_reader.py +1 -1
  23. cloe_nessy/integration/reader/exceptions.py +0 -0
  24. cloe_nessy/integration/reader/file_reader.py +78 -5
  25. cloe_nessy/integration/reader/reader.py +0 -0
  26. cloe_nessy/integration/writer/__init__.py +8 -1
  27. cloe_nessy/integration/writer/catalog_writer.py +0 -0
  28. cloe_nessy/integration/writer/delta_writer/__init__.py +7 -0
  29. cloe_nessy/integration/writer/delta_writer/delta_append_writer.py +108 -0
  30. cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +215 -0
  31. cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +21 -0
  32. cloe_nessy/integration/writer/delta_writer/delta_writer_base.py +210 -0
  33. cloe_nessy/integration/writer/delta_writer/exceptions.py +4 -0
  34. cloe_nessy/integration/writer/file_writer.py +132 -0
  35. cloe_nessy/integration/writer/writer.py +54 -0
  36. cloe_nessy/logging/__init__.py +0 -0
  37. cloe_nessy/logging/logger_mixin.py +0 -0
  38. cloe_nessy/models/__init__.py +4 -0
  39. cloe_nessy/models/adapter/__init__.py +3 -0
  40. cloe_nessy/models/adapter/unity_catalog_adapter.py +296 -0
  41. cloe_nessy/models/catalog.py +10 -0
  42. cloe_nessy/models/column.py +0 -0
  43. cloe_nessy/models/constraint.py +0 -0
  44. cloe_nessy/models/foreign_key.py +0 -0
  45. cloe_nessy/models/mixins/__init__.py +0 -0
  46. cloe_nessy/models/mixins/read_instance_mixin.py +0 -0
  47. cloe_nessy/models/mixins/template_loader_mixin.py +0 -0
  48. cloe_nessy/models/schema.py +20 -1
  49. cloe_nessy/models/table.py +67 -11
  50. cloe_nessy/models/types.py +0 -0
  51. cloe_nessy/models/volume.py +67 -0
  52. cloe_nessy/object_manager/__init__.py +7 -2
  53. cloe_nessy/object_manager/table_manager.py +251 -21
  54. cloe_nessy/object_manager/volume_manager.py +70 -0
  55. cloe_nessy/pipeline/__init__.py +0 -0
  56. cloe_nessy/pipeline/actions/__init__.py +9 -1
  57. cloe_nessy/pipeline/actions/read_api.py +0 -0
  58. cloe_nessy/pipeline/actions/read_catalog_table.py +1 -4
  59. cloe_nessy/pipeline/actions/read_excel.py +0 -0
  60. cloe_nessy/pipeline/actions/read_files.py +0 -0
  61. cloe_nessy/pipeline/actions/read_metadata_yaml.py +0 -0
  62. cloe_nessy/pipeline/actions/transform_change_datatype.py +0 -0
  63. cloe_nessy/pipeline/actions/transform_clean_column_names.py +0 -0
  64. cloe_nessy/pipeline/actions/transform_concat_columns.py +0 -0
  65. cloe_nessy/pipeline/actions/transform_decode.py +0 -0
  66. cloe_nessy/pipeline/actions/transform_deduplication.py +0 -0
  67. cloe_nessy/pipeline/actions/transform_distinct.py +0 -0
  68. cloe_nessy/pipeline/actions/transform_filter.py +0 -0
  69. cloe_nessy/pipeline/actions/transform_generic_sql.py +0 -0
  70. cloe_nessy/pipeline/actions/transform_group_aggregate.py +0 -0
  71. cloe_nessy/pipeline/actions/transform_hash_columns.py +209 -0
  72. cloe_nessy/pipeline/actions/transform_join.py +0 -0
  73. cloe_nessy/pipeline/actions/transform_json_normalize.py +0 -0
  74. cloe_nessy/pipeline/actions/transform_rename_columns.py +0 -0
  75. cloe_nessy/pipeline/actions/transform_replace_values.py +0 -0
  76. cloe_nessy/pipeline/actions/transform_select_columns.py +0 -0
  77. cloe_nessy/pipeline/actions/transform_union.py +0 -0
  78. cloe_nessy/pipeline/actions/write_catalog_table.py +0 -0
  79. cloe_nessy/pipeline/actions/write_delta_append.py +69 -0
  80. cloe_nessy/pipeline/actions/write_delta_merge.py +118 -0
  81. cloe_nessy/pipeline/actions/write_file.py +94 -0
  82. cloe_nessy/pipeline/pipeline.py +44 -2
  83. cloe_nessy/pipeline/pipeline_action.py +0 -0
  84. cloe_nessy/pipeline/pipeline_config.py +0 -0
  85. cloe_nessy/pipeline/pipeline_context.py +0 -0
  86. cloe_nessy/pipeline/pipeline_parsing_service.py +0 -0
  87. cloe_nessy/pipeline/pipeline_step.py +0 -0
  88. cloe_nessy/py.typed +0 -0
  89. cloe_nessy/session/__init__.py +0 -0
  90. cloe_nessy/session/session_manager.py +27 -0
  91. cloe_nessy/settings/__init__.py +0 -0
  92. cloe_nessy/settings/settings.py +0 -0
  93. cloe_nessy/utils/__init__.py +0 -0
  94. cloe_nessy/utils/file_and_directory_handler.py +0 -0
  95. cloe_nessy-0.3.9.dist-info/METADATA +70 -0
  96. {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.9.dist-info}/RECORD +35 -18
  97. {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.9.dist-info}/WHEEL +1 -1
  98. {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.9.dist-info}/top_level.txt +0 -0
  99. cloe_nessy-0.3.5.dist-info/METADATA +0 -26
cloe_nessy/__init__.py: File without changes (the remaining files listed above with +0 -0 are likewise unchanged)

cloe_nessy/integration/reader/api_reader.py CHANGED
@@ -39,6 +39,7 @@ class APIReader(BaseReader):

     def read(
         self,
+        *,
         endpoint: str = "",
         method: str = "GET",
         key: str | None = None,
@@ -66,7 +67,7 @@ class APIReader(BaseReader):
             max_retries: The maximum number of retries for the request.
             options: Additional options for the createDataFrame function.
             add_metadata_column: If set, adds a __metadata column containing metadata about the API response.
-            kwargs: This method does not accept any additional keyword arguments.
+            **kwargs: Additional keyword arguments to maintain compatibility with the base class method.

         Returns:
             DataFrame: The Spark DataFrame containing the read data in the json_object column.
@@ -74,7 +75,8 @@ class APIReader(BaseReader):
         Raises:
             RuntimeError: If there is an error with the API request or reading the data.
         """
-        options = options or {}
+        if options is None:
+            options = {}
         try:
             response = self.api_client.request(
                 method=method,
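
The new `*,` marker makes every argument after `self` keyword-only. A minimal sketch of the effect, assuming an already constructed APIReader instance named `reader` (its constructor is not shown in this diff) and a placeholder endpoint:

    # Keyword arguments are required as of 0.3.9.
    df = reader.read(endpoint="v1/items", method="GET", add_metadata_column=True)
    # reader.read("v1/items", "GET")  # would now raise TypeError: positional arguments are rejected
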

cloe_nessy/integration/reader/catalog_reader.py CHANGED
@@ -17,12 +17,13 @@ class CatalogReader(BaseReader):
         """Initializes the CatalogReader object."""
         super().__init__()

-    def read(self, table_identifier: str = "", **kwargs: Any) -> DataFrame:
+    def read(self, table_identifier: str = "", *, options: dict[str, str] | None = None, **kwargs: Any) -> DataFrame:
         """Reads a table from the Unity Catalog.

         Args:
             table_identifier: The table identifier in the Unity Catalog in the format 'catalog.schema.table'.
-            **kwargs: This method does not accept any additional keyword arguments.
+            options: PySpark options for the read table operation (not used in the current implementation).
+            **kwargs: Additional keyword arguments to maintain compatibility with the base class method.

         Returns:
             The Spark DataFrame containing the read data.
@@ -31,6 +32,8 @@ class CatalogReader(BaseReader):
             ValueError: If the table_identifier is not provided, is not a string, or is not in the correct format.
             Exception: For any other unexpected errors.
         """
+        if options is None:
+            options = {}
         if not table_identifier:
             raise ValueError("table_identifier is required")
         if not isinstance(table_identifier, str):
@@ -39,7 +42,7 @@ class CatalogReader(BaseReader):
             raise ValueError("table_identifier must be in the format 'catalog.schema.table'")

         try:
-            df = self._spark.read.table(table_identifier)
+            df = self._spark.read.table(table_identifier, **options)
             return df
         except AnalysisException as err:
             raise ValueError(f"Table not found: {table_identifier}") from err

cloe_nessy/integration/reader/excel_reader.py CHANGED
@@ -92,7 +92,7 @@ class ExcelDataFrameReader(BaseReader):
                 pyspark.pandas.read_excel and handed to TextFileReader.
             load_as_strings: If True, converts all columns to string type to avoid datatype conversion errors in Spark.
             add_metadata_column: If True, adds a metadata column containing the file location and sheet name.
-            kwargs: This method does not accept any additional keyword arguments.
+            **kwargs: Additional keyword arguments to maintain compatibility with the base class method.
         """
         if options is None:
             options = {}
cloe_nessy/integration/reader/exceptions.py: File without changes

cloe_nessy/integration/reader/file_reader.py CHANGED
@@ -1,7 +1,9 @@
 from typing import Any

 import pyspark.sql.functions as F
-from pyspark.sql import DataFrame
+from pyspark.sql import DataFrame, DataFrameReader
+from pyspark.sql.streaming import DataStreamReader
+from pyspark.sql.types import StructType

 from ...file_utilities import get_file_paths
 from .reader import BaseReader
@@ -17,9 +19,18 @@ class FileReader(BaseReader):
         """Initializes the FileReader object."""
         super().__init__()

+    def _get_reader(self) -> DataFrameReader:
+        """Returns a DataFrameReader."""
+        return self._spark.read
+
+    def _get_stream_reader(self) -> DataStreamReader:
+        """Returns a DataFrameReader."""
+        return self._spark.readStream
+
     def read(
         self,
         location: str,
+        *,
         spark_format: str | None = None,
         extension: str | None = None,
         schema: str | None = None,
@@ -38,7 +49,22 @@ class FileReader(BaseReader):
             search_subdirs: Whether to include files in subdirectories.
             options: Spark DataFrame reader options.
             add_metadata_column: Whether to include __metadata column in the DataFrame.
-            kwargs: This method does not accept any additional keyword arguments.
+            **kwargs: Additional keyword arguments to maintain compatibility with the base class method.
+
+        Raises:
+            ValueError: If neither spark_format nor extension is provided.
+            ValueError: If the provided extension is not supported.
+            Exception: If there is an error while reading the files.
+
+        Note:
+            - The `spark_format` parameter is used to specify the format of the files to be read.
+            - If `spark_format` is not provided, the method will try to infer it from the `extension`.
+            - The `extension` parameter is used to specify the file extension (e.g., 'csv', 'json', etc.).
+            - If both `spark_format` and `extension` are provided, `spark_format` will take precedence.
+            - The method will raise an error if neither `spark_format` nor `extension` is provided.
+
+        Returns:
+            A DataFrame containing the data from the files.
         """
         if options is None:
             options = {}
@@ -67,7 +93,7 @@ class FileReader(BaseReader):
         self._console_logger.debug(f"File paths: {file_paths}")
         assert spark_format is not None

-        reader = self._spark.read.format(spark_format)
+        reader = self._get_reader().format(spark_format)
         if schema:
             reader.schema(schema)
         else:
@@ -78,7 +104,7 @@ class FileReader(BaseReader):

         try:
             self._console_logger.debug("Loading files into DataFrame")
-            df = reader.load(file_paths)
+            df = reader.load([str(p) for p in file_paths])
             self._console_logger.debug("Successfully loaded files into DataFrame")
             if add_metadata_column:
                 df = self._add_metadata_column(df)
@@ -89,9 +115,56 @@ class FileReader(BaseReader):
         self._console_logger.info(f"Successfully read files from [ '{location}' ]")
         return df

+    def read_stream(
+        self,
+        location: str = "",
+        schema: StructType | str | None = None,
+        format: str = "delta",
+        add_metadata_column: bool = False,
+        options: dict[str, Any] | None = None,
+        **_: Any,
+    ) -> DataFrame:
+        """Reads specified location as a stream and returns streaming DataFrame.
+
+        Arguments:
+            location : Location of files to read.
+            format: Format of files to read.
+            schema: Schema of the file.
+            add_metadata_column: Whether to include __metadata column in the DataFrame.
+            options: Spark DataFrame reader options.
+
+        Raises:
+            ValueError: If location is not provided.
+
+        Returns:
+            A Streaming DataFrame
+        """
+        if not location:
+            raise ValueError("Location is required for streaming.")
+        self._console_logger.debug(f"Reading files from [ '{location}' ] ...")
+        try:
+            if options is None:
+                options = {}
+            reader = self._get_stream_reader()
+            reader.format(format)
+            reader.option("rescuedDataColumn", "_rescued_data")
+            if schema is None:
+                options["inferSchema"] = True
+            else:
+                reader.schema(schema)
+            reader.options(**options)
+            df = reader.load(location)
+            if add_metadata_column:
+                df = self._add_metadata_column(df)
+        except Exception as e:
+            self._console_logger.error(f"Failed to read files from [ '{location}' ]: {e}")
+            raise
+        else:
+            self._console_logger.info(f"Successfully read files from [ '{location}' ]")
+            return df
+
     def _add_metadata_column(self, df: DataFrame) -> DataFrame:
         """Add all metadata columns to the DataFrame."""
-        # Extract metadata fields into separate columns
         metadata_columns = df.select("_metadata.*").columns

         entries = [(F.lit(field), F.col(f"_metadata.{field}")) for field in metadata_columns]
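
For context, a minimal sketch of the new read_stream method; it assumes FileReader takes no constructor arguments, and the path, schema, and option values below are illustrative placeholders:

    reader = FileReader()
    stream_df = reader.read_stream(
        location="/mnt/landing/events/",       # placeholder path
        format="json",
        schema="id STRING, ts TIMESTAMP",      # passing a schema skips the inferSchema fallback
        add_metadata_column=True,
        options={"maxFilesPerTrigger": "10"},  # standard Spark file-source streaming option
    )
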
cloe_nessy/integration/reader/reader.py: File without changes

cloe_nessy/integration/writer/__init__.py CHANGED
@@ -1,3 +1,10 @@
 from .catalog_writer import CatalogWriter
+from .delta_writer import DeltaAppendWriter, DeltaMergeWriter
+from .file_writer import FileWriter

-__all__ = ["CatalogWriter"]
+__all__ = [
+    "CatalogWriter",
+    "DeltaAppendWriter",
+    "DeltaMergeWriter",
+    "FileWriter",
+]
cloe_nessy/integration/writer/catalog_writer.py: File without changes

cloe_nessy/integration/writer/delta_writer/__init__.py ADDED
@@ -0,0 +1,7 @@
+from .delta_append_writer import DeltaAppendWriter
+from .delta_merge_writer import DeltaMergeWriter
+
+__all__ = [
+    "DeltaAppendWriter",
+    "DeltaMergeWriter",
+]

cloe_nessy/integration/writer/delta_writer/delta_append_writer.py ADDED
@@ -0,0 +1,108 @@
+from pyspark.sql import DataFrame
+
+from ....object_manager import table_log_decorator
+from ....session import SessionManager
+from ..file_writer import FileWriter
+from .delta_table_operation_type import DeltaTableOperationType
+from .delta_writer_base import BaseDeltaWriter
+
+
+class DeltaAppendWriter(BaseDeltaWriter):
+    """A class for appending DataFrames to Delta tables."""
+
+    def __init__(self):
+        super().__init__()
+        self._spark = SessionManager.get_spark_session()
+        self._dbutils = SessionManager.get_utils()
+
+    @table_log_decorator(operation="append")
+    def write(
+        self,
+        table_identifier: str,
+        table_location: str,
+        data_frame: DataFrame,
+        ignore_empty_df: bool = False,
+        options: dict[str, str] | None = None,
+    ):
+        """Appends the provided DataFrame to a Delta table.
+
+        Args:
+            table_identifier: The identifier of the Delta table in the format 'catalog.schema.table'.
+            table_location: The location of the Delta table.
+            data_frame: The DataFrame to append to the table.
+            ignore_empty_df: If True, the function returns early without
+                doing anything if the DataFrame is empty.
+            options: Additional keyword arguments that will be passed to the 'write' method of the
+                FileDataFrameWriter instance. These can be any parameters accepted by the 'write'
+                method, which could include options for configuring the write operation, such as
+                'checkpointLocation' for specifying the path where checkpoints will be stored, or
+                'path' for specifying the path where the output data will be written.
+        """
+        if self._empty_dataframe_check(data_frame, ignore_empty_df):
+            return
+        writer = FileWriter()
+        writer.write(
+            data_frame=data_frame,
+            location=table_location,
+            format="DELTA",
+            mode="APPEND",
+            options=options,
+        )
+        self._report_delta_table_operation_metrics(
+            table_identifier=table_identifier, operation_type=DeltaTableOperationType.WRITE
+        )
+
+    @table_log_decorator(operation="stream_append")
+    def write_stream(
+        self,
+        table_identifier: str,
+        table_location: str,
+        data_frame: DataFrame,
+        checkpoint_location: str | None = None,
+        trigger_dict: dict | None = None,
+        options: dict[str, str] | None = None,
+        await_termination: bool = False,
+    ):
+        """Appends the provided DataFrame to a Delta table.
+
+        Args:
+            table_identifier: The identifier of the Delta table in the format 'catalog.schema.table'.
+            table_location: The location of the Delta table.
+            data_frame: The DataFrame to append to the table.
+            checkpoint_location: Location of checkpoint. If None, defaults
+                to the location of the table being written, with '_checkpoint_'
+                added before name. Default None.
+            trigger_dict: A dictionary specifying the trigger configuration for the streaming query.
+                Supported keys include:
+
+                - "processingTime": Specifies a time interval (e.g., "10 seconds") for micro-batch processing.
+                - "once": Processes all available data once and then stops.
+                - "continuous": Specifies a time interval (e.g., "1 second") for continuous processing.
+                - "availableNow": Processes all available data immediately and then stops.
+
+                If nothing is provided, the default is {"availableNow": True}.
+            options: Additional keyword arguments that will be passed to the
+                'write' method of the FileDataFrameWriter instance. These can be
+                any parameters accepted by the 'write' method, which could
+                include options for configuring the write operation.
+            await_termination: If True, the function will wait for the streaming
+                query to finish before returning. This is useful for ensuring that
+                the data has been fully written before proceeding with other
+                operations.
+
+        Returns:
+            None.
+        """
+        writer = FileWriter()
+        writer.write_stream(
+            data_frame=data_frame,
+            location=table_location,
+            format="DELTA",
+            checkpoint_location=checkpoint_location,
+            mode="APPEND",
+            trigger_dict=trigger_dict,
+            options=options,
+        )
+        self._report_delta_table_operation_metrics(
+            table_identifier=table_identifier, operation_type=DeltaTableOperationType.WRITE
+        )
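
A hedged usage sketch of the new appender; the identifiers, locations, and options are placeholders, and df and stream_df stand for previously created batch and streaming DataFrames:

    writer = DeltaAppendWriter()
    writer.write(
        table_identifier="my_catalog.my_schema.events",
        table_location="/mnt/tables/events",   # placeholder Delta table location
        data_frame=df,
        ignore_empty_df=True,
        options={"mergeSchema": "true"},       # forwarded to the underlying FileWriter.write
    )
    writer.write_stream(
        table_identifier="my_catalog.my_schema.events",
        table_location="/mnt/tables/events",
        data_frame=stream_df,
        trigger_dict={"availableNow": True},   # documented default, shown explicitly
        await_termination=True,
    )
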

cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py ADDED
@@ -0,0 +1,215 @@
+from typing import Any, Self
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pyspark.sql import DataFrame
+
+from ....models import Table
+from ....object_manager import table_log_decorator
+from ....session import SessionManager
+from .delta_table_operation_type import DeltaTableOperationType
+from .delta_writer_base import BaseDeltaWriter
+
+
+class DeltaMergeConfig(BaseModel):
+    """Configuration for Merge options.
+
+    Args:
+        dataframe_columns: The columns of the DataFrame.
+        key_columns: List of column names that form the key for the merge
+            operation.
+        when_matched_update: Flag to specify whether to perform an update
+            operation when matching records are found in the target Delta table.
+        when_matched_delete: Flag to specify whether to perform a delete
+            operation when matching records are found in the target Delta table.
+        when_not_matched_insert: Flag to specify whether to perform an insert
+            operation when matching records are not found in the target Delta
+            table.
+        cols_to_exclude_from_update: List of column names to be excluded from
+            the update in the target Delta table.
+        use_partition_pruning: Flag to specify whether to use partition
+            pruning to optimize the performance of the merge operation.
+        partition_by: List of column names to partition by.
+    """
+
+    dataframe_columns: list[str]
+    key_columns: list[str]
+    cols_to_exclude_from_update: list[str] = Field(default_factory=list)
+    when_matched_update: bool = True
+    when_matched_delete: bool = False
+    when_not_matched_insert: bool = True
+    use_partition_pruning: bool = True
+    partition_by: list[str] = Field(default_factory=list)
+    cols_to_merge: list[str] = Field(default_factory=list, alias="_cols_to_merge")
+    cols_to_update: set[str] = Field(default_factory=set, alias="_cols_to_update")
+    cols_to_insert: set[str] = Field(default_factory=set, alias="_cols_to_insert")
+    final_cols_to_update: dict[str, str] = Field(default_factory=dict)
+    final_cols_to_insert: dict[str, str] = Field(default_factory=dict)
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    @model_validator(mode="before")
+    @classmethod
+    def _validate_update_delete(cls, config: Any):
+        """Update and delete operations must be mutually exclusive."""
+        if config.get("when_matched_update") and config.get("when_matched_delete"):
+            raise ValueError("Update and delete operations cannot be used together.")
+        return config
+
+    @model_validator(mode="before")
+    @classmethod
+    def _validate_key_columns(cls, config: Any):
+        """Key columns must exist in the data frame."""
+        key_columns = config.get("key_columns")
+        dataframe_columns = config.get("dataframe_columns")
+        if not set(key_columns).issubset(set(dataframe_columns)):
+            raise ValueError("Key columns must exist in the DataFrame columns.")
+        return config
+
+    @model_validator(mode="before")
+    @classmethod
+    def _derive_merge_columns(cls, config: Any):
+        """Derive update and insert columns from the DataFrame columns."""
+        dataframe_columns = config.get("dataframe_columns", [])
+        config["_cols_to_merge"] = list(set(dataframe_columns))
+        if config.get("cols_to_exclude_from_update"):
+            config["_cols_to_update"] = set(config["_cols_to_merge"]) - set(config["cols_to_exclude_from_update"])
+        else:
+            config["_cols_to_update"] = set(config["_cols_to_merge"])
+
+        config["_cols_to_insert"] = config["_cols_to_merge"]
+        config["final_cols_to_update"] = {col: f"source.{col}" for col in config["_cols_to_update"]}
+        config["final_cols_to_insert"] = {col: f"source.{col}" for col in config["_cols_to_insert"]}
+        return config
+
+    @model_validator(mode="after")
+    @classmethod
+    def _validate_partition_pruning(cls, config: Self):
+        """If partition_pruning is set, the partition by columns must be known."""
+        if config.use_partition_pruning is True and not config.partition_by:
+            raise ValueError("Partition columns must be specified when using partition pruning.")
+        return config
+
+    @model_validator(mode="after")
+    @classmethod
+    def _validate_cols_exist(cls, config: Any):
+        """If partition_pruning is set, the partition by columns must be known."""
+        if any(col not in config.cols_to_merge for col in config.cols_to_update) or any(
+            col not in config.cols_to_merge for col in config.cols_to_insert
+        ):
+            raise ValueError(
+                "You specified column names for UPDATE or INSERT that either don't exist in the dataframe "
+                "or are explicitly excluded from the MERGE.",
+            )
+        return config
+
+
+class DeltaMergeWriter(BaseDeltaWriter):
+    """A class for merging DataFrames to Delta tables."""
+
+    def __init__(self):
+        super().__init__()
+        self._spark = SessionManager.get_spark_session()
+        self._dbutils = SessionManager.get_utils()
+
+    def _validate_table_inputs(
+        self, table: Table | None, table_identifier: str | None, storage_path: str | None
+    ) -> tuple[str, str]:
+        """Validates and retrieves table identifier and storage path."""
+        if table is None and (table_identifier is None or storage_path is None):
+            raise ValueError("Either a Table object or table_identifier and storage_path must be provided.")
+        if table is not None:
+            table_identifier = table.identifier
+            storage_path = str(table.storage_path)
+        if not storage_path:
+            raise ValueError("Storage path must be provided or extracted from the Table object.")
+        assert table_identifier is not None, "Table identifier must be provided."
+        return table_identifier, storage_path
+
+    def _build_match_conditions(self, data_frame: DataFrame, config: DeltaMergeConfig) -> str:
+        """Builds match conditions for the Delta table merge."""
+        match_conditions = self._merge_match_conditions(config.key_columns)
+        if config.use_partition_pruning:
+            match_conditions_list = [match_conditions] + [
+                self._partition_pruning_conditions(data_frame, config.partition_by),
+            ]
+            match_conditions = " AND ".join(match_conditions_list)
+        return match_conditions
+
+    def _build_merge_operations(
+        self, delta_table, data_frame: DataFrame, config: DeltaMergeConfig, match_conditions: str
+    ):
+        """Builds the Delta table merge operations."""
+        delta_table_merge = delta_table.alias("target").merge(
+            source=data_frame.alias("source"),
+            condition=match_conditions,
+        )
+        if config.when_matched_update:
+            delta_table_merge = delta_table_merge.whenMatchedUpdate(set=config.final_cols_to_update)
+        elif config.when_matched_delete:
+            delta_table_merge = delta_table_merge.whenMatchedDelete()
+        if config.when_not_matched_insert:
+            delta_table_merge = delta_table_merge.whenNotMatchedInsert(values=config.final_cols_to_insert)
+        return delta_table_merge
+
+    @table_log_decorator(operation="merge")
+    def write(
+        self,
+        data_frame: DataFrame,
+        table: Table | None = None,
+        table_identifier: str | None = None,
+        storage_path: str | None = None,
+        ignore_empty_df: bool = False,
+        **kwargs: Any,
+    ):
+        """Merges the data in a spark DataFrame into a Delta table.
+
+        This function performs a merge operation between a DataFrame and a Delta
+        table. The function supports update, delete, and insert operations on
+        the target Delta table based on conditions specified by the user. The
+        function also supports partition pruning to optimize the performance of
+        the merge operation.
+
+        Args:
+            table: The Table object representing the Delta table.
+            table_identifier: The identifier of the Delta table in the format
+                'catalog.schema.table'.
+            storage_path: The location of the Delta table.
+            data_frame: The DataFrame to be merged into the Delta table.
+            ignore_empty_df: A flag indicating whether to ignore an empty source
+                dataframe.
+            kwargs: Passed to the
+                [`DeltaMergeConfig`][cloe_nessy.integration.writer.delta_merge_writer.DeltaMergeConfig].
+
+        Raises:
+            ValueError: If both, table and table_identifier or storage_path are provided.
+            EmptyDataframeException: If the source dataframe is empty and
+                ignore_empty_df is False.
+            ValueError: If the specified columns for update or insert do not
+                exist in the DataFrame or are explicitly excluded from the
+                merge operation.
+            ValueError: If partition columns are not specified when using
+                partition pruning.
+        """
+        if self._empty_dataframe_check(data_frame, ignore_empty_df):
+            return
+        table_identifier, storage_path = self._validate_table_inputs(table, table_identifier, storage_path)
+
+        config = DeltaMergeConfig(dataframe_columns=data_frame.columns, **kwargs)
+
+        delta_table = self.table_manager.get_delta_table(location=storage_path, spark=data_frame.sparkSession)
+
+        match_conditions = self._build_match_conditions(data_frame, config)
+
+        delta_table_merge = self._build_merge_operations(delta_table, data_frame, config, match_conditions)
+        delta_table_merge.execute()
+        self._report_delta_table_operation_metrics(
+            table_identifier,
+            operation_type=DeltaTableOperationType.MERGE,
+        )
+
+    @table_log_decorator(operation="stream_merge")
+    def write_stream(self):
+        """Not implemented yet. See docs for more details."""
+        raise NotImplementedError(
+            "Streaming merge is not implemented yet. Please use the `write` method for batch merges."
+        )
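
A hedged sketch of a merge call; keyword arguments after ignore_empty_df are forwarded to DeltaMergeConfig, df stands for a previously created DataFrame, and all names and paths below are placeholders:

    writer = DeltaMergeWriter()
    writer.write(
        data_frame=df,
        table_identifier="my_catalog.my_schema.customers",
        storage_path="/mnt/tables/customers",        # placeholder location of the Delta table
        key_columns=["customer_id"],                 # must exist in df.columns
        partition_by=["country"],                    # required because use_partition_pruning defaults to True
        cols_to_exclude_from_update=["created_at"],  # kept unchanged on matched rows
    )
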

cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py ADDED
@@ -0,0 +1,21 @@
+from enum import Enum
+
+
+class DeltaTableOperationType(Enum):
+    """Mapping between Delta table operation types and their operation metric keys available in the Delta table history.
+
+    Values of metric keys included in this mapping are reported using the
+    logging capabilities of the Delta operations of the DeltaManager.
+
+    See https://docs.databricks.com/delta/history.html for a complete list and
+    description of available metrics for each operation type.
+    """
+
+    UPDATE = ["numUpdatedRows"]
+    DELETE = ["numDeletedRows", "numRemovedFiles"]
+    MERGE = ["numSourceRows", "numTargetRowsInserted", "numTargetRowsUpdated", "numTargetRowsDeleted", "numOutputRows"]
+    WRITE = ["numOutputRows"]
+    TRUNCATE = ["numRemovedFiles"]
+    OPTIMIZE = ["numAddedFiles", "numRemovedFiles", "minFileSize", "p50FileSize", "maxFileSize"]
+    VACUUM = ["numDeletedFiles"]
+    STREAMING_UPDATE = ["numRemovedFiles", "numOutputRows", "numOutputBytes", "numAddedFiles"]
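
The enum values are the operationMetrics keys that Delta records in a table's history. A hedged sketch of how they could be looked up after a merge, assuming an active Spark session and a placeholder table name:

    last_op = spark.sql("DESCRIBE HISTORY my_catalog.my_schema.customers LIMIT 1").collect()[0]
    merge_metrics = {
        key: last_op["operationMetrics"].get(key)
        for key in DeltaTableOperationType.MERGE.value
    }
    # Values come back as strings in the history's operationMetrics map.
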