cloe-nessy 0.3.5__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. cloe_nessy/__init__.py +0 -0
  2. cloe_nessy/clients/__init__.py +0 -0
  3. cloe_nessy/clients/api_client/__init__.py +0 -0
  4. cloe_nessy/clients/api_client/api_client.py +0 -0
  5. cloe_nessy/clients/api_client/api_response.py +0 -0
  6. cloe_nessy/clients/api_client/auth.py +0 -0
  7. cloe_nessy/clients/api_client/exceptions.py +0 -0
  8. cloe_nessy/file_utilities/__init__.py +0 -0
  9. cloe_nessy/file_utilities/exceptions.py +0 -0
  10. cloe_nessy/file_utilities/factory.py +0 -0
  11. cloe_nessy/file_utilities/get_file_paths.py +0 -0
  12. cloe_nessy/file_utilities/location_types.py +0 -0
  13. cloe_nessy/file_utilities/strategies/__init__.py +0 -0
  14. cloe_nessy/file_utilities/strategies/base_strategy.py +0 -0
  15. cloe_nessy/file_utilities/strategies/local_strategy.py +0 -0
  16. cloe_nessy/file_utilities/strategies/onelake_strategy.py +0 -0
  17. cloe_nessy/file_utilities/strategies/utils_strategy.py +0 -0
  18. cloe_nessy/integration/__init__.py +0 -0
  19. cloe_nessy/integration/reader/__init__.py +0 -0
  20. cloe_nessy/integration/reader/api_reader.py +4 -2
  21. cloe_nessy/integration/reader/catalog_reader.py +6 -3
  22. cloe_nessy/integration/reader/excel_reader.py +1 -1
  23. cloe_nessy/integration/reader/exceptions.py +0 -0
  24. cloe_nessy/integration/reader/file_reader.py +78 -5
  25. cloe_nessy/integration/reader/reader.py +0 -0
  26. cloe_nessy/integration/writer/__init__.py +8 -1
  27. cloe_nessy/integration/writer/catalog_writer.py +0 -0
  28. cloe_nessy/integration/writer/delta_writer/__init__.py +7 -0
  29. cloe_nessy/integration/writer/delta_writer/delta_append_writer.py +108 -0
  30. cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +215 -0
  31. cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +21 -0
  32. cloe_nessy/integration/writer/delta_writer/delta_writer_base.py +210 -0
  33. cloe_nessy/integration/writer/delta_writer/exceptions.py +4 -0
  34. cloe_nessy/integration/writer/file_writer.py +132 -0
  35. cloe_nessy/integration/writer/writer.py +54 -0
  36. cloe_nessy/logging/__init__.py +0 -0
  37. cloe_nessy/logging/logger_mixin.py +0 -0
  38. cloe_nessy/models/__init__.py +4 -0
  39. cloe_nessy/models/adapter/__init__.py +3 -0
  40. cloe_nessy/models/adapter/unity_catalog_adapter.py +296 -0
  41. cloe_nessy/models/catalog.py +10 -0
  42. cloe_nessy/models/column.py +0 -0
  43. cloe_nessy/models/constraint.py +0 -0
  44. cloe_nessy/models/foreign_key.py +0 -0
  45. cloe_nessy/models/mixins/__init__.py +0 -0
  46. cloe_nessy/models/mixins/read_instance_mixin.py +0 -0
  47. cloe_nessy/models/mixins/template_loader_mixin.py +0 -0
  48. cloe_nessy/models/schema.py +20 -1
  49. cloe_nessy/models/table.py +67 -11
  50. cloe_nessy/models/types.py +0 -0
  51. cloe_nessy/models/volume.py +67 -0
  52. cloe_nessy/object_manager/__init__.py +7 -2
  53. cloe_nessy/object_manager/table_manager.py +251 -21
  54. cloe_nessy/object_manager/volume_manager.py +70 -0
  55. cloe_nessy/pipeline/__init__.py +0 -0
  56. cloe_nessy/pipeline/actions/__init__.py +9 -1
  57. cloe_nessy/pipeline/actions/read_api.py +0 -0
  58. cloe_nessy/pipeline/actions/read_catalog_table.py +1 -4
  59. cloe_nessy/pipeline/actions/read_excel.py +0 -0
  60. cloe_nessy/pipeline/actions/read_files.py +0 -0
  61. cloe_nessy/pipeline/actions/read_metadata_yaml.py +0 -0
  62. cloe_nessy/pipeline/actions/transform_change_datatype.py +0 -0
  63. cloe_nessy/pipeline/actions/transform_clean_column_names.py +0 -0
  64. cloe_nessy/pipeline/actions/transform_concat_columns.py +0 -0
  65. cloe_nessy/pipeline/actions/transform_decode.py +0 -0
  66. cloe_nessy/pipeline/actions/transform_deduplication.py +0 -0
  67. cloe_nessy/pipeline/actions/transform_distinct.py +0 -0
  68. cloe_nessy/pipeline/actions/transform_filter.py +0 -0
  69. cloe_nessy/pipeline/actions/transform_generic_sql.py +0 -0
  70. cloe_nessy/pipeline/actions/transform_group_aggregate.py +0 -0
  71. cloe_nessy/pipeline/actions/transform_hash_columns.py +209 -0
  72. cloe_nessy/pipeline/actions/transform_join.py +0 -0
  73. cloe_nessy/pipeline/actions/transform_json_normalize.py +0 -0
  74. cloe_nessy/pipeline/actions/transform_rename_columns.py +0 -0
  75. cloe_nessy/pipeline/actions/transform_replace_values.py +0 -0
  76. cloe_nessy/pipeline/actions/transform_select_columns.py +0 -0
  77. cloe_nessy/pipeline/actions/transform_union.py +0 -0
  78. cloe_nessy/pipeline/actions/write_catalog_table.py +0 -0
  79. cloe_nessy/pipeline/actions/write_delta_append.py +69 -0
  80. cloe_nessy/pipeline/actions/write_delta_merge.py +118 -0
  81. cloe_nessy/pipeline/actions/write_file.py +94 -0
  82. cloe_nessy/pipeline/pipeline.py +44 -2
  83. cloe_nessy/pipeline/pipeline_action.py +0 -0
  84. cloe_nessy/pipeline/pipeline_config.py +0 -0
  85. cloe_nessy/pipeline/pipeline_context.py +0 -0
  86. cloe_nessy/pipeline/pipeline_parsing_service.py +0 -0
  87. cloe_nessy/pipeline/pipeline_step.py +0 -0
  88. cloe_nessy/py.typed +0 -0
  89. cloe_nessy/session/__init__.py +0 -0
  90. cloe_nessy/session/session_manager.py +27 -0
  91. cloe_nessy/settings/__init__.py +0 -0
  92. cloe_nessy/settings/settings.py +0 -0
  93. cloe_nessy/utils/__init__.py +0 -0
  94. cloe_nessy/utils/file_and_directory_handler.py +0 -0
  95. cloe_nessy-0.3.9.dist-info/METADATA +70 -0
  96. {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.9.dist-info}/RECORD +35 -18
  97. {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.9.dist-info}/WHEEL +1 -1
  98. {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.9.dist-info}/top_level.txt +0 -0
  99. cloe_nessy-0.3.5.dist-info/METADATA +0 -26
@@ -1,58 +1,288 @@
1
+ import functools
2
+ import logging
3
+ from dataclasses import dataclass, field
4
+
5
+ from delta import DeltaTable # type: ignore
6
+
1
7
  from ..logging import LoggerMixin
8
+ from ..models import Table
2
9
  from ..session import SessionManager
3
10
 
4
11
 
12
@dataclass
class TableManagerLogs:
    """Settings describing the tabular log table used by the table manager.

    The annotated attributes are dataclass fields and therefore show up in
    ``TableManagerLogs().__dict__``. ``logger_name`` carries no annotation, so
    it is a plain class attribute and is not part of the instance ``__dict__``.
    """

    # Plain class attribute (not a dataclass field).
    logger_name = "Tabular:TableManager"

    # Dataclass fields.
    log_type: str = "nessy_simple_logs"
    uc_table_name: str = "nessy_simple_logs"
    # A factory is used so every instance gets its own column mapping.
    uc_table_columns: dict[str, str] = field(default_factory=lambda: dict(message="STRING"))
24
+
25
+
26
def table_log_decorator(operation: str):
    """Creates a decorator that logs the start, failure (if any), and completion of a table operation.

    The created decorator wraps a method that performs an operation on a table. It logs the start
    of the operation, calls the original method, logs the exception if one is raised (and re-raises
    it), and otherwise logs the completion and returns the method's result. Wrapped methods must
    live on an object that provides the ``self._tabular_logger`` attribute.

    The table is looked up in this order: the ``table_identifier`` keyword, the ``table`` keyword,
    the first positional argument. If none is present, ``None`` is logged as identifier and the
    wrapped method is still called, so that it can raise its own validation error.

    Args:
        operation: The name of the operation to be logged. This will be included in the log messages.

    Returns:
        inner_decorator: A decorator that can be used to wrap a method that performs an operation on a table.

    Example:
        ```python
        @table_log_decorator(operation='delete_physical_data_for_table')
        def _delete_physical_data(self, table_identifier: str):
            self._dbutils.fs.rm(table_location, recurse=True)
        ```
    """

    def inner_decorator(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            # Guard the positional fallback: indexing an empty ``args`` would raise an
            # IndexError here and hide the wrapped method's own argument validation.
            table_identifier = (
                kwargs.get("table_identifier") or kwargs.get("table") or (args[0] if args else None)
            )
            if isinstance(table_identifier, Table):
                table_identifier = table_identifier.identifier
            self._tabular_logger.info(
                "operation:%s | identifier:%s | status:start | error:''",
                operation,
                table_identifier,
            )
            try:
                result = func(self, *args, **kwargs)
            except Exception as e:
                self._tabular_logger.error(
                    "operation:%s | identifier:%s | status:failed | error:%s",
                    operation,
                    table_identifier,
                    e,
                )
                # Bare raise keeps the original traceback untouched.
                raise
            self._tabular_logger.info(
                "operation:%s | identifier:%s | status:completed | error:''",
                operation,
                table_identifier,
            )
            # Hand the wrapped method's result back to the caller instead of dropping it.
            return result

        return wrapper

    return inner_decorator
79
+
80
+
5
81
class TableManager(LoggerMixin):
    """TableManager class for managing tables."""

    def __init__(self, tabular_logger: logging.Logger | None = None):
        """Initializes the manager with a Spark session and its loggers.

        Args:
            tabular_logger: Optional logger used for the tabular operation logs. If not
                provided, one is created from the `TableManagerLogs` settings.
        """
        self._spark = SessionManager.get_spark_session()
        self._console_logger = self.get_console_logger()
        self._console_logger.debug("TableManager initialized...")
        self._tabular_logger = tabular_logger or self.get_tabular_logger(**TableManagerLogs().__dict__)
        self._tabular_logger.debug("message:TableManager initialized.")

    @table_log_decorator(operation="create")
    def create_table(
        self,
        table: Table,
        ignore_if_exists: bool = False,
        replace: bool = False,
    ) -> None:
        """Creates a Table in the catalog.

        Args:
            table: A Table object representing the Delta table.
            ignore_if_exists: If set to True, the function will return early
                without doing anything if the table already exists.
            replace: If set to True, the function will replace the table if it
                already exists.
        """
        if ignore_if_exists and self.table_exists(table):
            return
        self._console_logger.info(f"Creating table: {table.identifier}")
        self._spark.sql(f"USE CATALOG {table.catalog};")
        self._spark.sql(f"USE SCHEMA {table.schema};")
        for statement in table.get_create_statement(replace=replace).split(";"):
            # Splitting on ';' leaves empty or whitespace-only fragments (e.g. after the
            # last statement); those must not be sent to Spark.
            if statement.strip():
                self._spark.sql(statement)

    def drop_table(
        self,
        table: Table | None = None,
        storage_location: str | None = None,
        table_identifier: str | None = None,
        delete_physical_data: bool = False,
    ):
        """Deletes a Table. For security reasons you are forced to pass the table_name.

        If delete_physical_data is True the actual physical data on the ADLS will be deleted.
        Use with caution!

        Args:
            table: The Table object representing the Delta table.
            storage_location: The location of the Delta table on the ADLS.
            table_identifier: The table identifier in the catalog. Must be in the format 'catalog.schema.table'.
            delete_physical_data: If set to True, deletes not only the metadata
                within the Catalog but also the physical data.

        Raises:
            ValueError: If neither table nor table_identifier is provided, or if table is
                combined with table_identifier or storage_location.
            ValueError: If physical data shall be deleted but neither the table object
                provides a storage path nor a storage_location is given.
        """
        if table is not None and (table_identifier is not None or storage_location is not None):
            raise ValueError("Either table or table_identifier and storage_location must be provided, but not both.")
        # Validate before anything destructive happens; otherwise the physical data could
        # be removed and the catalog drop would fail afterwards.
        if table is None and table_identifier is None:
            raise ValueError("Either table or table_identifier must be provided, but not both.")
        if table is not None:
            table_identifier = table.identifier
        # Log only after the identifier has been resolved from the Table object.
        self._console_logger.info(f"Deleting table [ '{table_identifier}' ] ...")
        if delete_physical_data:
            if table is not None:
                # Passing the Table lets _delete_physical_data reject a missing storage
                # path instead of deleting the literal path "None".
                self._delete_physical_data(table=table)
            else:
                self._delete_physical_data(location=storage_location)
        self.drop_table_from_catalog(table_identifier=table_identifier)

    def drop_table_from_catalog(self, table_identifier: str | None = None, table: Table | None = None) -> None:
        """Removes a table from the catalog. Physical data is retained.

        Args:
            table_identifier: The table identifier in the catalog. Must be in the format 'catalog.schema.table'.
            table: The Table object representing the Delta table.

        Raises:
            ValueError: If neither table nor table_identifier is provided, or if both are provided.
        """
        if (table is None and table_identifier is None) or (table is not None and table_identifier is not None):
            raise ValueError("Either table or table_identifier must be provided, but not both.")
        if table is not None:
            table_identifier = table.identifier
        self._console_logger.info(f"... deleting table [ '{table_identifier}' ] from Catalog.")
        self._spark.sql(f"DROP TABLE IF EXISTS {table_identifier};")

    def _delete_physical_data(self, table: Table | None = None, location: str | None = None):
        """Removes the physical data on the ADLS for the location of this table.

        Args:
            table: The Table object representing the Delta table to be deleted.
            location: The location of the Delta table to be deleted.

        Raises:
            ValueError: If neither table nor location is provided, or if both are provided.
            ValueError: If the table storage path is not provided by the table object.
        """
        if (table is None and location is None) or (table is not None and location is not None):
            raise ValueError("Either table or location must be provided, but not both.")
        if table is not None:
            if table.storage_path is None:
                raise ValueError("Table storage path must be provided.")
            location = str(table.storage_path)
        SessionManager.get_utils().fs.rm(location, recurse=True)
        self._console_logger.info("... deleting physical data.")

    def get_delta_table(self, table: Table | None = None, location: str | None = None, spark=None) -> DeltaTable:
        """Get the DeltaTable object from the Table objects location or a location string.

        Args:
            table: A Table object representing the Delta table.
            location: A string representing the table location.
            spark: An optional Spark session. If not provided, the current Spark session will be used.

        Returns:
            The DeltaTable object corresponding to the given Table object or location string.

        Raises:
            ValueError: If neither table nor location is provided, or if both are provided.
            ValueError: If the table storage path is not provided by the table object.
        """
        if (table is None and location is None) or (table is not None and location is not None):
            raise ValueError("Either table or location must be provided, but not both.")

        if table is not None:
            # str(None) would silently become the path "None"; fail with a clear message.
            if table.storage_path is None:
                raise ValueError("Table storage path must be provided.")
            location = str(table.storage_path)
        self._console_logger.info(f"Getting DeltaTable object for location: {location}")
        return DeltaTable.forPath(spark or self._spark, str(location))

    def table_exists(self, table: Table | None = None, table_identifier: str | None = None) -> bool:
        """Checks if a table exists in the catalog.

        Args:
            table: A Table object representing the Delta table.
            table_identifier: A string representing the table identifier in the format 'catalog.schema.table'.

        Returns:
            True if the table exists, else False.

        Raises:
            ValueError: If neither table nor table_identifier is provided, or if both are provided.
            ValueError: If the table_identifier is not in the format 'catalog.schema.table'.
        """
        if (table is None and table_identifier is None) or (table is not None and table_identifier is not None):
            raise ValueError("Either table or table_identifier must be provided, but not both.")

        if table is not None:
            catalog = table.catalog
            schema = table.schema
            table_name = table.name
        else:
            # Cannot fail after the check above; only narrows the type for static analysis.
            assert table_identifier is not None, "table_identifier must be provided."
            parts = table_identifier.split(".")
            # Check the part count explicitly so that a malformed identifier yields the
            # documented message instead of an opaque tuple-unpacking error.
            if len(parts) != 3 or not all(parts):
                raise ValueError("Invalid table identifier format. Expected 'catalog.schema.table'.")
            catalog, schema, table_name = parts

        # NOTE(review): the names are interpolated into the SQL text unescaped; this
        # presumably relies on identifiers coming from trusted metadata - confirm.
        query_result = self._spark.sql(
            f"""
                SELECT 1 FROM {catalog}.information_schema.tables
                WHERE table_name = '{table_name}'
                AND table_schema = '{schema}'
                LIMIT 1""",
        )
        result = query_result.count() > 0
        self._console_logger.info(f"Table [ '{catalog}.{schema}.{table_name}' ] exists: {result}")
        return result

    @table_log_decorator(operation="refresh")
    def refresh_table(self, table: Table | None = None, table_identifier: str | None = None):
        """Refreshes the metadata of a Delta table.

        Args:
            table: A Table object representing the Delta table.
            table_identifier: The identifier of the Delta table in the format 'catalog.schema.table'.

        Raises:
            ValueError: If neither table nor table_identifier is provided, or if both are provided.
        """
        if (table is None and table_identifier is None) or (table is not None and table_identifier is not None):
            raise ValueError("Either table or table_identifier must be provided, but not both.")

        if table is not None:
            # NOTE(review): truncate_table uses table.escaped_identifier while this method
            # builds the unescaped name by hand - confirm whether that is intended.
            table_identifier = f"{table.catalog}.{table.schema}.{table.name}"

        self._console_logger.info(f"Refreshing table: {table_identifier}")
        self._spark.sql(f"REFRESH TABLE {table_identifier};")

    @table_log_decorator(operation="truncate")
    def truncate_table(
        self,
        table: Table | None = None,
        table_identifier: str | None = None,
    ):
        """Truncates a table.

        Args:
            table: A Table object representing the Delta table.
            table_identifier: The identifier of the Delta table in the format 'catalog.schema.table'.

        Raises:
            ValueError: If neither table nor table_identifier is provided, or if both are provided.
        """
        if (table is None and table_identifier is None) or (table is not None and table_identifier is not None):
            raise ValueError("Either table or table_identifier must be provided, but not both.")

        if table is not None:
            table_identifier = table.escaped_identifier

        self._console_logger.info(f"Truncating table: {table_identifier}")
        self._spark.sql(f"TRUNCATE TABLE {table_identifier};")
@@ -0,0 +1,70 @@
1
+ import logging
2
+
3
+ from ..logging import LoggerMixin
4
+ from ..models import Volume
5
+ from ..session import SessionManager
6
+
7
+
8
class VolumeManager(LoggerMixin):
    """VolumeManager class for managing volumes."""

    def __init__(self, console_logger: logging.Logger | None = None):
        """Initializes the manager with a Spark session and a console logger.

        Args:
            console_logger: Optional logger to use. If not provided, the console
                logger of the LoggerMixin is used.
        """
        self._spark = SessionManager.get_spark_session()
        self._console_logger = console_logger or self.get_console_logger()

    def create_volume(self, volume: Volume):
        """Creates a Volume in the catalog.

        Args:
            volume: A Volume object representing the UC object.
        """
        self._console_logger.info(f"Creating volume: {volume.identifier}")
        self._spark.sql(f"USE CATALOG {volume.catalog};")
        self._spark.sql(f"USE SCHEMA {volume.schema_name};")
        for statement in volume.get_create_statement().split(";"):
            # Splitting on ';' leaves empty or whitespace-only fragments; skip them.
            if statement.strip():
                self._spark.sql(statement)

    def drop_volume(self, volume: Volume, if_exists: bool = True):
        """Delete the volume.

        Args:
            volume: The volume to be deleted.
            if_exists: If False, an error will be raised if the volume does not exist.
        """
        self._console_logger.info(f"Deleting volume: [' {volume.identifier}' ]")
        self._spark.sql(f"DROP VOLUME {'IF EXISTS' if if_exists else ''} {volume.escaped_identifier};")
        self._console_logger.info(f"Volume [' {volume.identifier}' ] has been deleted.")

    def volume_exists(self, volume: Volume | None = None, volume_identifier: str | None = None) -> bool:
        """Check if the volume exists.

        Args:
            volume: The volume to check.
            volume_identifier: The identifier of the volume to check.

        Raises:
            ValueError: If both volume and volume_identifier are provided.
            ValueError: If neither volume nor volume_identifier is provided.
            ValueError: If the identifier is not in the format 'catalog.schema.volume_name'.

        Returns:
            True if the volume exists, False otherwise.
        """
        if volume and volume_identifier:
            raise ValueError("Only one of volume or volume_identifier should be provided.")
        if volume:
            volume_identifier = volume.identifier

        # Explicit check instead of an assert: asserts are stripped when Python runs
        # with -O, which would surface as an AttributeError on None further down.
        if volume_identifier is None:
            raise ValueError("Either volume or volume_identifier must be provided.")

        if volume_identifier.count(".") != 2:
            raise ValueError("The identifier must be in the format 'catalog.schema.volume_name'.")
        catalog, volume_schema, volume_name = volume_identifier.split(".")
        query_result = self._spark.sql(
            f"""
                SELECT 1 FROM {catalog}.information_schema.volumes
                WHERE volume_name = '{volume_name}'
                AND volume_schema = '{volume_schema}'
                LIMIT 1""",
        )
        return query_result.count() > 0
File without changes
@@ -14,6 +14,7 @@ from .transform_distinct import TransformDistinctAction
14
14
  from .transform_filter import TransformFilterAction
15
15
  from .transform_generic_sql import TransformSqlAction
16
16
  from .transform_group_aggregate import TransformGroupAggregate
17
+ from .transform_hash_columns import TransformHashColumnsAction
17
18
  from .transform_join import TransformJoinAction
18
19
  from .transform_json_normalize import TransformJsonNormalize
19
20
  from .transform_rename_columns import TransformRenameColumnsAction
@@ -21,6 +22,9 @@ from .transform_replace_values import TransformReplaceValuesAction
21
22
  from .transform_select_columns import TransformSelectColumnsAction
22
23
  from .transform_union import TransformUnionAction
23
24
  from .write_catalog_table import WriteCatalogTableAction
25
+ from .write_delta_append import WriteDeltaAppendAction
26
+ from .write_delta_merge import WriteDeltaMergeAction
27
+ from .write_file import WriteFileAction
24
28
 
25
29
  # Get all subclasses of PipelineAction defined in this submodule
26
30
  pipeline_actions = {cls.name: cls for cls in PipelineAction.__subclasses__()}
@@ -35,7 +39,6 @@ __all__ = [
35
39
  "ReadExcelAction",
36
40
  "ReadFilesAction",
37
41
  "ReadMetadataYAMLAction",
38
- "WriteCatalogTableAction",
39
42
  "PipelineActionType",
40
43
  "TransformFilterAction",
41
44
  "TransformUnionAction",
@@ -51,4 +54,9 @@ __all__ = [
51
54
  "TransformRenameColumnsAction",
52
55
  "TransformReplaceValuesAction",
53
56
  "TransformSelectColumnsAction",
57
+ "WriteCatalogTableAction",
58
+ "WriteDeltaAppendAction",
59
+ "WriteDeltaMergeAction",
60
+ "WriteFileAction",
61
+ "TransformHashColumnsAction",
54
62
  ]
File without changes
@@ -61,8 +61,5 @@ class ReadCatalogTableAction(PipelineAction):
61
61
  raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
62
62
 
63
63
  table_reader = CatalogReader()
64
- df = table_reader.read(
65
- table_identifier=table_identifier,
66
- **options,
67
- )
64
+ df = table_reader.read(table_identifier=table_identifier, options=options)
68
65
  return context.from_existing(data=df)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,209 @@
1
+ from typing import Any
2
+
3
+ from pydantic import BaseModel, Field, model_validator
4
+ from pyspark.sql import functions as F
5
+
6
+ from ..pipeline_action import PipelineAction
7
+ from ..pipeline_context import PipelineContext
8
+
9
# Names of the hashing algorithms accepted by the hash settings validation.
SUPPORTED_ALGORITHMS = {
    "hash",
    "md5",
    "sha1",
    "sha2",
    "xxhash64",
    "crc32",
}
# Bit lengths accepted for the 'sha2' algorithm.
VALID_SHA2_BITS = {
    224,
    256,
    384,
    512,
}
11
+
12
+
13
class HashSettings(BaseModel):
    """Represents the settings for hashing columns.

    Attributes:
        columns: List of column names to hash.
        algorithm: Hashing algorithm to use. Must be one of
            "hash", "md5", "sha1", "sha2", "xxhash64", or "crc32".
        bits: Bit length for the 'sha2' algorithm. Must be set when the
            algorithm is 'sha2' and must not be set otherwise.
    """

    columns: list[str]
    algorithm: str = Field(..., description="Hashing algorithm to use")
    bits: int | None = Field(default=None, description="Only required for sha2")

    @model_validator(mode="before")
    @classmethod
    def validate_all(cls, values):
        """Validates the input values for a hashing operation before model instantiation.

        This method performs the following checks:

        1. Ensures the specified hashing algorithm is supported.
        2. Validates that the columns parameter is a non-empty list.
        3. Checks that hashing multiple columns is only requested for the 'hash' and 'xxhash64' algorithms.
        4. For the 'sha2' algorithm, ensures that the 'bits' parameter is one of the valid options.
        5. Ensures that the 'bits' parameter is not provided for algorithms other than 'sha2'.

        Input that is not a dictionary is returned unchanged, so that the regular
        pydantic validation decides whether it is acceptable.

        Args:
            values: The raw input, normally a dictionary containing 'algorithm', 'columns', and 'bits'.

        Returns:
            The validated input values.

        Raises:
            ValueError: If the algorithm is unsupported, no columns are provided, the columns parameter is invalid,
                or the 'bits' parameter is invalid for the specified algorithm.
            NotImplementedError: If multiple columns are provided and the algorithm does not support hashing multiple columns.
        """
        # A "before" validator receives the raw input, which is not necessarily a dict
        # (it may e.g. already be a HashSettings instance); only dicts are inspected here.
        if not isinstance(values, dict):
            return values

        algorithm = values.get("algorithm")
        columns = values.get("columns")
        bits = values.get("bits")

        if algorithm not in SUPPORTED_ALGORITHMS:
            raise ValueError(
                f"Unsupported hashing algorithm '{algorithm}'. Supported algorithms are: {SUPPORTED_ALGORITHMS}."
            )

        # The isinstance test comes first so that the emptiness test only ever sees a list.
        if not isinstance(columns, list) or not columns:
            raise ValueError("At least one column must be provided.")

        if len(columns) > 1 and algorithm not in {"hash", "xxhash64"}:
            raise NotImplementedError(
                f"Hashing multiple columns is only supported for 'hash' and 'xxhash64'. Algorithm '{algorithm}' does not support this."
            )

        if algorithm == "sha2":
            if bits not in VALID_SHA2_BITS:
                raise ValueError(f"'bits' must be one of {VALID_SHA2_BITS} when using 'sha2'.")
        elif bits is not None:
            raise ValueError("'bits' is only allowed when algorithm is 'sha2'.")

        return values
75
+
76
+
77
class HashConfig(BaseModel):
    """A configuration model for defining hash settings for specific columns.

    Attributes:
        hash_config: A dictionary where the keys are the names of the columns
            to create (as strings) and the values are `HashSettings` objects
            that define how each of these columns is computed.

    Methods:
        validate_config: Validates the hash configuration to ensure it contains
            at least one entry and that all column names are valid strings. Raises a
            `ValueError` if the configuration is invalid.
    """

    hash_config: dict[str, HashSettings]

    @model_validator(mode="before")
    @classmethod
    def validate_config(cls, values):
        """Validates the hash configuration provided in the model.

        This method is executed in "before" mode to ensure that the `hash_config`
        field in the input values meets the required criteria:

        - It must be a dictionary.
        - It must contain at least one entry.
        - Each key in the dictionary must be a non-empty string.

        Input that is not a dictionary is returned unchanged, so that the regular
        pydantic validation decides whether it is acceptable.

        Args:
            values: The input values to validate.

        Returns:
            The validated input values.

        Raises:
            ValueError: If `hash_config` is missing, not a dictionary, empty, or
                contains invalid column names.
        """
        # A "before" validator receives the raw input, which is not necessarily a dict.
        if not isinstance(values, dict):
            return values

        config = values.get("hash_config")
        if not isinstance(config, dict) or not config:
            raise ValueError("Hash configuration must contain at least one entry.")
        for new_col in config:
            if not new_col or not isinstance(new_col, str):
                raise ValueError(f"Invalid column name '{new_col}' in hash configuration.")
        return values
122
+
123
+
124
class TransformHashColumnsAction(PipelineAction):
    """Hashes specified columns in a DataFrame using a chosen algorithm.

    `hash_config` is a mapping from the name of the column to create to the
    settings used to compute it. Only 'hash' and 'xxhash64' accept more than one
    input column, and 'bits' must be given for (and only for) 'sha2'.

    Example:
        ```yaml
        Hash Columns:
            action: TRANSFORM_HASH_COLUMNS
            options:
                hash_config:
                    hashed_column1:
                        columns: ["column1", "column2"]
                        algorithm: "xxhash64"
                    hashed_column2:
                        columns: ["column1"]
                        algorithm: "sha2"
                        bits: 256
        ```

    Given a DataFrame `df` with the following structure:

    | column1 | column2 | column3 |
    |---------|---------|---------|
    | foo     | bar     | baz     |

    After running the action, the resulting DataFrame will look like:

    | column1 | column2 | column3 | hashed_column1           | hashed_column2      |
    |---------|---------|---------|--------------------------|---------------------|
    | foo     | bar     | baz     | <xxhash64 of "foo||bar"> | <sha2-256 of "foo"> |

    !!! note "Hash values might vary"
        The actual hash values will depend on the hashing algorithm used and the input data.
    """

    name: str = "TRANSFORM_HASH_COLUMNS"

    def run(
        self,
        context: PipelineContext,
        *,
        hash_config: dict[str, Any] | None = None,
        **_: Any,
    ) -> PipelineContext:
        """Hashes the specified columns in the DataFrame.

        Args:
            context: Context in which this Action is executed.
            hash_config: Dictionary that maps each new column name to its hash
                settings ('columns', 'algorithm' and, for 'sha2', 'bits'). It is
                validated with `HashConfig` before any column is added.

        Returns:
            Updated PipelineContext with hashed columns.

        Raises:
            ValueError: If the context data is None or no hash configuration is given.
            ValueError: If the hash configuration is invalid.
        """
        if context.data is None:
            raise ValueError("Context data is required for hashing.")

        if not hash_config:
            raise ValueError("Hash configuration is required.")

        df = context.data

        # All algorithms except 'hash' work on the input columns joined with '||';
        # 'hash' receives the columns as separate arguments. Every result is cast to string.
        hash_functions = {
            "hash": lambda cols: F.hash(*[F.col(c) for c in cols]).cast("string"),
            "xxhash64": lambda cols: F.xxhash64(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
            "md5": lambda cols: F.md5(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
            "sha1": lambda cols: F.sha1(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
            "sha2": lambda cols, bits: F.sha2(F.concat_ws("||", *[F.col(c) for c in cols]), bits).cast("string"),
            "crc32": lambda cols: F.crc32(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
        }
        # Defensive fallback only: the HashSettings validation already requires 'bits' for 'sha2'.
        default_sha2_bits = 256

        config_obj = HashConfig.model_validate({"hash_config": hash_config})
        for new_col, config in config_obj.hash_config.items():
            hash_func = hash_functions[config.algorithm]
            if config.algorithm == "sha2":
                df = df.withColumn(new_col, hash_func(config.columns, config.bits or default_sha2_bits))  # type: ignore
            else:
                df = df.withColumn(new_col, hash_func(config.columns))  # type: ignore

        return context.from_existing(data=df)
File without changes