cloe-nessy 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. cloe_nessy/__init__.py +0 -0
  2. cloe_nessy/clients/__init__.py +5 -0
  3. cloe_nessy/clients/api_client/__init__.py +3 -0
  4. cloe_nessy/clients/api_client/api_client.py +188 -0
  5. cloe_nessy/clients/api_client/api_response.py +72 -0
  6. cloe_nessy/clients/api_client/auth.py +178 -0
  7. cloe_nessy/clients/api_client/exceptions.py +22 -0
  8. cloe_nessy/file_utilities/__init__.py +3 -0
  9. cloe_nessy/file_utilities/exceptions.py +4 -0
  10. cloe_nessy/file_utilities/factory.py +42 -0
  11. cloe_nessy/file_utilities/get_file_paths.py +72 -0
  12. cloe_nessy/file_utilities/location_types.py +29 -0
  13. cloe_nessy/file_utilities/strategies/__init__.py +0 -0
  14. cloe_nessy/file_utilities/strategies/base_strategy.py +59 -0
  15. cloe_nessy/file_utilities/strategies/local_strategy.py +51 -0
  16. cloe_nessy/file_utilities/strategies/onelake_strategy.py +31 -0
  17. cloe_nessy/file_utilities/strategies/utils_strategy.py +72 -0
  18. cloe_nessy/integration/__init__.py +0 -0
  19. cloe_nessy/integration/reader/__init__.py +6 -0
  20. cloe_nessy/integration/reader/api_reader.py +141 -0
  21. cloe_nessy/integration/reader/catalog_reader.py +49 -0
  22. cloe_nessy/integration/reader/excel_reader.py +170 -0
  23. cloe_nessy/integration/reader/exceptions.py +10 -0
  24. cloe_nessy/integration/reader/file_reader.py +96 -0
  25. cloe_nessy/integration/reader/reader.py +34 -0
  26. cloe_nessy/integration/writer/__init__.py +3 -0
  27. cloe_nessy/integration/writer/catalog_writer.py +48 -0
  28. cloe_nessy/logging/__init__.py +3 -0
  29. cloe_nessy/logging/logger_mixin.py +162 -0
  30. cloe_nessy/models/__init__.py +13 -0
  31. cloe_nessy/models/column.py +65 -0
  32. cloe_nessy/models/constraint.py +9 -0
  33. cloe_nessy/models/foreign_key.py +34 -0
  34. cloe_nessy/models/mixins/__init__.py +0 -0
  35. cloe_nessy/models/mixins/read_instance_mixin.py +124 -0
  36. cloe_nessy/models/mixins/template_loader_mixin.py +18 -0
  37. cloe_nessy/models/schema.py +76 -0
  38. cloe_nessy/models/table.py +236 -0
  39. cloe_nessy/models/types.py +7 -0
  40. cloe_nessy/object_manager/__init__.py +3 -0
  41. cloe_nessy/object_manager/table_manager.py +58 -0
  42. cloe_nessy/pipeline/__init__.py +7 -0
  43. cloe_nessy/pipeline/actions/__init__.py +50 -0
  44. cloe_nessy/pipeline/actions/read_api.py +178 -0
  45. cloe_nessy/pipeline/actions/read_catalog_table.py +68 -0
  46. cloe_nessy/pipeline/actions/read_excel.py +177 -0
  47. cloe_nessy/pipeline/actions/read_files.py +105 -0
  48. cloe_nessy/pipeline/actions/read_metadata_yaml.py +66 -0
  49. cloe_nessy/pipeline/actions/transform_change_datatype.py +56 -0
  50. cloe_nessy/pipeline/actions/transform_concat_columns.py +88 -0
  51. cloe_nessy/pipeline/actions/transform_decode.py +102 -0
  52. cloe_nessy/pipeline/actions/transform_distinct.py +40 -0
  53. cloe_nessy/pipeline/actions/transform_filter.py +51 -0
  54. cloe_nessy/pipeline/actions/transform_generic_sql.py +66 -0
  55. cloe_nessy/pipeline/actions/transform_join.py +81 -0
  56. cloe_nessy/pipeline/actions/transform_json_normalize.py +106 -0
  57. cloe_nessy/pipeline/actions/transform_rename_columns.py +60 -0
  58. cloe_nessy/pipeline/actions/transform_replace_values.py +59 -0
  59. cloe_nessy/pipeline/actions/transform_select_columns.py +83 -0
  60. cloe_nessy/pipeline/actions/transform_union.py +71 -0
  61. cloe_nessy/pipeline/actions/write_catalog_table.py +73 -0
  62. cloe_nessy/pipeline/pipeline.py +201 -0
  63. cloe_nessy/pipeline/pipeline_action.py +62 -0
  64. cloe_nessy/pipeline/pipeline_config.py +92 -0
  65. cloe_nessy/pipeline/pipeline_context.py +56 -0
  66. cloe_nessy/pipeline/pipeline_parsing_service.py +156 -0
  67. cloe_nessy/pipeline/pipeline_step.py +50 -0
  68. cloe_nessy/py.typed +0 -0
  69. cloe_nessy/session/__init__.py +3 -0
  70. cloe_nessy/session/session_manager.py +188 -0
  71. cloe_nessy/settings/__init__.py +3 -0
  72. cloe_nessy/settings/settings.py +91 -0
  73. cloe_nessy/utils/__init__.py +0 -0
  74. cloe_nessy/utils/file_and_directory_handler.py +19 -0
  75. cloe_nessy-0.2.9.dist-info/METADATA +26 -0
  76. cloe_nessy-0.2.9.dist-info/RECORD +78 -0
  77. cloe_nessy-0.2.9.dist-info/WHEEL +5 -0
  78. cloe_nessy-0.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,68 @@
+ from typing import Any
+
+ from ...integration.reader import CatalogReader
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class ReadCatalogTableAction(PipelineAction):
+     """Reads a table from Unity Catalog using a specified table identifier and optional reader configurations.
+
+     This function retrieves data from a catalog table using the
+     [`CatalogReader`][cloe_nessy.integration.reader.catalog_reader] identified
+     by either the `table_identifier` parameter or the `table_metadata` from the
+     provided `PipelineContext` of a previous step. The retrieved data is loaded
+     into a DataFrame and returned as part of an updated `PipelineContext`.
+
+     Example:
+         ```yaml
+         Read Sales Table:
+             action: READ_CATALOG_TABLE
+             options:
+                 table_identifier: my_catalog.business_schema.sales_table
+                 options: <options for the reader>
+         ```
+     """
+
+     name: str = "READ_CATALOG_TABLE"
+
+     @staticmethod
+     def run(
+         context: PipelineContext,
+         *,
+         table_identifier: str | None = None,
+         options: dict[str, str] | None = None,
+         **_: Any,  # define kwargs to match the base class signature
+     ) -> PipelineContext:
+         """Reads a table from Unity Catalog using a specified table identifier and optional reader configurations.
+
+         Args:
+             context: The pipeline's context, which contains
+                 metadata and configuration for the action.
+             table_identifier: The identifier of the catalog table to
+                 read. If not provided, the function will attempt to use the table
+                 identifier from the `table_metadata` in the `context`.
+             options: A dictionary of options for customizing
+                 the catalog reader's behavior, such as filters or reading modes. Defaults
+                 to None.
+
+         Raises:
+             ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
+
+         Returns:
+             An updated pipeline context containing the data read from the catalog table as a DataFrame.
+         """
+         if not options:
+             options = dict()
+
+         if (table_metadata := context.table_metadata) and table_identifier is None:
+             table_identifier = table_metadata.identifier
+         if table_identifier is None:
+             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
+
+         table_reader = CatalogReader()
+         df = table_reader.read(
+             table_identifier=table_identifier,
+             **options,
+         )
+         return context.from_existing(data=df)
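The READ_CATALOG_TABLE hunk above (`read_catalog_table.py`) can also be exercised directly from Python rather than through a YAML pipeline. The sketch below is illustrative only: the `cloe_nessy.pipeline` import paths and the bare `PipelineContext()` constructor are assumptions based on the package layout in this diff, not confirmed API.

```python
# Hypothetical direct call of the action; import paths and the PipelineContext
# constructor are assumptions, not confirmed by this diff.
from cloe_nessy.pipeline import PipelineContext
from cloe_nessy.pipeline.actions import ReadCatalogTableAction

# run() is a @staticmethod, so no instance is required.
result = ReadCatalogTableAction.run(
    PipelineContext(),  # assumed: an empty context without table_metadata
    table_identifier="my_catalog.business_schema.sales_table",
)
result.data.show()  # DataFrame produced by CatalogReader.read()
```

If a previous step placed table metadata in the context, `table_identifier` can be omitted and the action falls back to `context.table_metadata.identifier`, as the code above shows.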
@@ -0,0 +1,177 @@
+ from collections.abc import Callable
+ from functools import reduce
+
+ from pyspark.sql import DataFrame
+
+ from ...file_utilities import get_file_paths
+ from ...integration.reader import ExcelDataFrameReader
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class ReadExcelAction(PipelineAction):
+     """Reads data from an Excel file or directory of Excel files and returns a DataFrame.
+
+     The function reads Excel files using the
+     [`ExcelDataFrameReader`][cloe_nessy.integration.reader.excel_reader] either
+     from a single file or a directory path. It can read specific sheets, handle
+     file extensions, and offers various options to customize how the data is
+     read, such as specifying headers, index columns, and handling missing
+     values. The resulting data is returned as a DataFrame, and metadata about
+     the read files can be included in the context.
+
+     Example:
+         ```yaml
+         Read Excel Table:
+             action: READ_EXCEL
+             options:
+                 file: excel_file_folder/excel_files_june/interesting_excel_file.xlsx
+                 usecols:
+                     - key_column
+                     - interesting_column
+                 options: <more options for the reader>
+         ```
+     """
+
+     name: str = "READ_EXCEL"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         file: str | None = None,
+         path: str | None = None,
+         extension: str = "xlsx",
+         recursive: bool = False,
+         sheet_name: str | int | list = 0,
+         sheet_name_as_column: bool = False,
+         header: int | list[int] = 0,
+         index_col: int | list[int] | None = None,
+         usecols: int | str | list | Callable | None = None,
+         dtype: str | None = None,
+         fillna: str | dict[str, list[str]] | dict[str, str] | None = None,
+         true_values: list | None = None,
+         false_values: list | None = None,
+         nrows: int | None = None,
+         na_values: list[str] | dict[str, list[str]] | None = None,
+         keep_default_na: bool = True,
+         parse_dates: bool | list | dict = False,
+         date_parser: Callable | None = None,
+         thousands: str | None = None,
+         include_index: bool = False,
+         options: dict | None = None,
+         add_metadata_column: bool = True,
+         load_as_strings: bool = False,
+         **_,
+     ) -> PipelineContext:
+         """Reads data from an Excel file or directory of Excel files and returns a DataFrame.
+
+         Args:
+             context: The context in which the action is executed.
+             file: The path to a single Excel file. Either `file` or `path` must be specified.
+             path: The directory path containing multiple Excel files. Either `file` or `path` must be specified.
+             extension: The file extension to look for when reading from a directory.
+             recursive: Whether to include subdirectories when reading from a directory path.
+             sheet_name: The sheet name(s) or index(es) to read from the Excel file.
+             sheet_name_as_column: Whether to add a column with the sheet name to the DataFrame.
+             header: Row number(s) to use as the column labels.
+             index_col: Column(s) to use as the index of the DataFrame.
+             usecols: Subset of columns to parse. Can be an integer, string, list,
+                 or function.
+             dtype: Data type for the columns.
+             fillna: Method or value to use to fill NaN values.
+             true_values: Values to consider as True.
+             false_values: Values to consider as False.
+             nrows: Number of rows to parse.
+             na_values: Additional strings to recognize as NaN/NA.
+             keep_default_na: Whether to append default NaN values when custom `na_values` are specified.
+             parse_dates: Options for parsing date columns.
+             date_parser: Function to use for converting strings to datetime objects.
+             thousands: Thousands separator to use when parsing numeric columns.
+             include_index: Whether to include an index column in the output DataFrame.
+             options: Additional options to pass to the DataFrame reader.
+             add_metadata_column: Whether to add a metadata column with file information to the DataFrame.
+             load_as_strings: Whether to load all columns as strings.
+
+         Raises:
+             ValueError: Raised if both `file` and `path` are specified, or if neither is provided.
+
+         Returns:
+             The updated context, with the read data as a DataFrame.
+         """
+         if not options:
+             options = dict()
+
+         if file is not None and path is not None:
+             self._tabular_logger.error("message: Only one of file or path have to be specified.")
+             raise ValueError("Only one of file or path have to be specified.")
+
+         excel_reader = ExcelDataFrameReader()
+         if file is not None:
+             df = excel_reader.read(
+                 location=file,
+                 sheet_name=sheet_name,
+                 sheet_name_as_column=sheet_name_as_column,
+                 header=header,
+                 index_col=index_col,
+                 usecols=usecols,
+                 true_values=true_values,
+                 false_values=false_values,
+                 nrows=nrows,
+                 dtype=dtype,
+                 fillna=fillna,
+                 na_values=na_values,
+                 keep_default_na=keep_default_na,
+                 parse_dates=parse_dates,
+                 date_parser=date_parser,
+                 thousands=thousands,
+                 include_index=include_index,
+                 options=options,
+                 add_metadata_column=add_metadata_column,
+                 load_as_strings=load_as_strings,
+             )
+         elif path is not None:
+             file_list = get_file_paths(path, extension, recursive)
+             df_dict: dict = {}
+             for path in file_list:
+                 df_dict[path] = excel_reader.read(
+                     location=path,
+                     sheet_name=sheet_name,
+                     sheet_name_as_column=sheet_name_as_column,
+                     header=header,
+                     index_col=index_col,
+                     usecols=usecols,
+                     dtype=dtype,
+                     fillna=fillna,
+                     true_values=true_values,
+                     false_values=false_values,
+                     nrows=nrows,
+                     na_values=na_values,
+                     keep_default_na=keep_default_na,
+                     parse_dates=parse_dates,
+                     date_parser=date_parser,
+                     thousands=thousands,
+                     include_index=include_index,
+                     options=options,
+                     add_metadata_column=add_metadata_column,
+                     load_as_strings=load_as_strings,
+                 )
+             df = reduce(DataFrame.unionAll, list(df_dict.values()))
+
+         else:
+             self._tabular_logger.error("action_name: READ_EXCEL | message: Either file or path have to be specified.")
+             raise ValueError("Either file or path have to be specified.")
+
+         runtime_info = context.runtime_info
+
+         if add_metadata_column:
+             read_files_list = list(set([x.file_path for x in df.select("__metadata.file_path").collect()]))
+             if runtime_info is None:
+                 runtime_info = {"read_files": read_files_list}
+             else:
+                 try:
+                     runtime_info["read_files"] = list(set(runtime_info["read_files"] + read_files_list))
+                 except KeyError:
+                     runtime_info["read_files"] = read_files_list
+
+         return context.from_existing(data=df)
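When a directory is passed via `path`, the READ_EXCEL action above reads each file into its own DataFrame and folds the frames together with `reduce(DataFrame.unionAll, ...)`. The minimal PySpark sketch below (file names and columns are made up for illustration) shows what that combination step does: `unionAll` matches columns by position, so every file must share the same column layout.

```python
# Minimal sketch of the union pattern used in the path branch above.
from functools import reduce

from pyspark.sql import DataFrame, SparkSession

spark = SparkSession.builder.getOrCreate()

# Stand-ins for the per-file DataFrames produced by ExcelDataFrameReader.read().
df_dict = {
    "june.xlsx": spark.createDataFrame([(1, "a")], ["key_column", "interesting_column"]),
    "july.xlsx": spark.createDataFrame([(2, "b")], ["key_column", "interesting_column"]),
}

# unionAll combines frames by column position, not by column name.
combined = reduce(DataFrame.unionAll, list(df_dict.values()))
combined.show()  # two rows, one per source file
```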
@@ -0,0 +1,105 @@
+ from typing import Any
+
+ from ...integration.reader import FileReader
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class ReadFilesAction(PipelineAction):
+     """Reads files from a specified location.
+
+     If an extension is provided, all files with the given extension will be read
+     using the [`FileReader`][cloe_nessy.integration.reader.file_reader]. If no
+     extension is provided, the `spark_format` must be set, and all files in the
+     location will be read using a DataFrameReader with the specified format.
+
+     Example:
+         ```yaml
+         Read Excel Table:
+             action: READ_FILES
+             options:
+                 location: excel_file_folder/excel_files_june/
+                 search_subdirs: True
+                 spark_format: AVRO
+         ```
+     """
+
+     name: str = "READ_FILES"
+
+     @staticmethod
+     def run(
+         context: PipelineContext,
+         *,
+         location: str | None = None,
+         search_subdirs: bool = False,
+         extension: str | None = None,
+         spark_format: str | None = None,
+         schema: str | None = None,
+         add_metadata_column: bool = True,
+         options: dict[str, str] | None = None,
+         **_: Any,
+     ) -> PipelineContext:
+         """Reads files from a specified location.
+
+         Args:
+             context: The context in which this Action is executed.
+             location: The location from which to read files.
+             search_subdirs: Recursively search subdirectories for files
+                 if an extension is provided.
+             extension: The file extension to filter files by.
+             spark_format: The format to use for reading the files.
+             schema: The schema of the data. If None, schema is obtained from
+                 the context metadata.
+             add_metadata_column: Whether to include the `__metadata` column with
+                 file metadata in the DataFrame.
+             options: Additional options passed to the reader.
+
+         Raises:
+             ValueError: If neither `extension` nor `spark_format` are provided, or if
+                 no location is specified.
+
+         Returns:
+             The context after the Action has been executed, containing the read data as a DataFrame.
+         """
+         if not location:
+             raise ValueError("No location provided. Please specify location to read files from.")
+         if not options:
+             options = dict()
+
+         if (metadata := context.table_metadata) and schema is None:
+             schema = metadata.schema
+
+         file_reader = FileReader()
+         if extension:
+             df = file_reader.read(
+                 location=location,
+                 schema=schema,
+                 extension=extension,
+                 search_subdirs=search_subdirs,
+                 options=options,
+                 add_metadata_column=add_metadata_column,
+             )
+         elif spark_format:
+             df = file_reader.read(
+                 location=location,
+                 schema=schema,
+                 spark_format=spark_format,
+                 options=options,
+                 add_metadata_column=add_metadata_column,
+             )
+         else:
+             raise ValueError("Please provide either the 'extension' or 'spark_format'")
+
+         runtime_info = context.runtime_info
+
+         if add_metadata_column:
+             read_files_list = [x.file_path for x in df.select("__metadata.file_path").drop_duplicates().collect()]
+             if runtime_info is None:
+                 runtime_info = {"read_files": read_files_list}
+             else:
+                 try:
+                     runtime_info["read_files"] = list(set(runtime_info["read_files"] + read_files_list))
+                 except KeyError:
+                     runtime_info["read_files"] = read_files_list
+
+         return context.from_existing(data=df, runtime_info=runtime_info)
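As with the other read actions in this release, READ_FILES (`read_files.py`) can be invoked directly. The sketch below mirrors the docstring example but uses the extension branch; it assumes, without confirmation from this diff, that the action and `PipelineContext` are importable from `cloe_nessy.pipeline` and that an empty context can be constructed with no arguments.

```python
# Hypothetical direct call; imports and PipelineContext() are assumptions.
from cloe_nessy.pipeline import PipelineContext
from cloe_nessy.pipeline.actions import ReadFilesAction

result = ReadFilesAction.run(
    PipelineContext(),                  # assumed empty context (no table_metadata)
    location="excel_file_folder/excel_files_june/",
    extension="csv",                    # extension branch: FileReader filters for this suffix
    search_subdirs=True,
)
# With add_metadata_column left at its default of True, the file paths that
# were read are recorded in the returned context under runtime_info["read_files"].
print(result.runtime_info["read_files"])
```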
@@ -0,0 +1,66 @@
+ import pathlib
+ from typing import Any
+
+ from ...models import Schema
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class ReadMetadataYAMLAction(PipelineAction):
+     """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.
+
+     Example:
+         ```yaml
+         Read Schema Metadata:
+             action: READ_METADATA_YAML_ACTION
+             options:
+                 path: excel_file_folder/excel_files_june/
+                 file_name: sales_schema.yml
+                 table_name: sales
+         ```
+     """
+
+     name: str = "READ_METADATA_YAML_ACTION"
+
+     @staticmethod
+     def run(
+         context: PipelineContext,
+         *,
+         path: str | None = None,
+         file_name: str | None = None,
+         table_name: str | None = None,
+         **_: Any,
+     ) -> PipelineContext:
+         """Reads schema metadata from a yaml file using the `Schema` model.
+
+         Args:
+             context: The context in which this Action is executed.
+             path: The path to the data contract directory.
+             file_name: The name of the file that defines the schema.
+             table_name: The name of the table for which to retrieve metadata.
+
+         Raises:
+             ValueError: If any issues occur while reading the schema, such as an invalid schema,
+                 missing file, or missing path.
+
+         Returns:
+             The context after the execution of this Action, containing the table metadata.
+         """
+         if not path:
+             raise ValueError("No path provided. Please specify path to schema metadata.")
+         if not file_name:
+             raise ValueError("No file_name provided. Please specify file name.")
+         if not table_name:
+             raise ValueError("No table_name provided. Please specify table name.")
+
+         path_obj = pathlib.Path(path)
+
+         schema, errors = Schema.read_instance_from_file(path_obj / file_name)
+         if errors:
+             raise ValueError(f"Errors while reading schema metadata: {errors}")
+         if not schema:
+             raise ValueError("No schema found in metadata.")
+
+         table = schema.get_table_by_name(table_name=table_name)
+
+         return context.from_existing(table_metadata=table)
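The READ_METADATA_YAML_ACTION hunk (`read_metadata_yaml.py`) only populates `table_metadata`; it reads no data itself. A hedged direct-call sketch, reusing the docstring's example values and the same unconfirmed assumptions about import paths and the `PipelineContext()` constructor:

```python
# Hypothetical direct call; imports and PipelineContext() are assumptions.
from cloe_nessy.pipeline import PipelineContext
from cloe_nessy.pipeline.actions import ReadMetadataYAMLAction

result = ReadMetadataYAMLAction.run(
    PipelineContext(),                        # assumed empty context
    path="excel_file_folder/excel_files_june/",
    file_name="sales_schema.yml",
    table_name="sales",
)
# The returned context carries the Table model for "sales"; a later step such as
# READ_CATALOG_TABLE or READ_FILES can pick it up via context.table_metadata.
print(result.table_metadata)
```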
@@ -0,0 +1,56 @@
+ from typing import Any
+
+ import pyspark.sql.functions as F
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformChangeDatatypeAction(PipelineAction):
+     """Changes the datatypes of specified columns in the given DataFrame.
+
+     Example:
+         ```yaml
+         Transform Columns:
+             action: TRANSFORM_CHANGE_DATATYPE
+             options:
+                 columns:
+                     id: string
+                     revenue: long
+         ```
+     """
+
+     name: str = "TRANSFORM_CHANGE_DATATYPE"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         columns: dict[str, str] | None = None,
+         **_: Any,  # define kwargs to match the base class signature
+     ) -> PipelineContext:
+         """Changes the datatypes of specified columns in the given DataFrame.
+
+         Args:
+             context: The context in which this Action is executed.
+             columns: A dictionary where the key is the column
+                 name and the value is the desired datatype.
+
+         Raises:
+             ValueError: If no columns are provided.
+             ValueError: If the data from context is None.
+
+         Returns:
+             The context after the execution of this Action, containing the DataFrame with updated column datatypes.
+         """
+         if not columns:
+             raise ValueError("No columns provided.")
+
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         df = context.data
+         change_columns = {col: F.col(col).cast(dtype) for col, dtype in columns.items()}
+         df = df.withColumns(change_columns)  # type: ignore
+
+         return context.from_existing(data=df)  # type: ignore
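The core of TRANSFORM_CHANGE_DATATYPE is the dict comprehension that maps each column name to a `cast` expression and hands the whole dict to `DataFrame.withColumns` (available in Spark 3.3+). A plain PySpark illustration of that pattern, with made-up data:

```python
# Plain PySpark sketch of the cast pattern the action builds.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "19.99")], ["id", "revenue"])

columns = {"id": "string", "revenue": "double"}
# One Column expression per target column, applied in a single withColumns call.
change_columns = {name: F.col(name).cast(dtype) for name, dtype in columns.items()}

df.withColumns(change_columns).printSchema()  # id: string, revenue: double
```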
@@ -0,0 +1,88 @@
+ from typing import Any
+
+ import pyspark.sql.functions as F
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformConcatColumnsAction(PipelineAction):
+     """Concatenates the specified columns in the given DataFrame.
+
+     Example:
+         ```yaml
+         Concat Columns:
+             action: TRANSFORM_CONCAT_COLUMNS
+             options:
+                 name: address
+                 columns:
+                     - street
+                     - postcode
+                     - country
+                 separator: ', '
+         ```
+     """
+
+     name: str = "TRANSFORM_CONCAT_COLUMNS"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         name: str = "",
+         columns: list[str] | None = None,
+         separator: str | None = None,
+         **_: Any,
+     ) -> PipelineContext:
+         """Concatenates the specified columns in the given DataFrame.
+
+         !!!warning
+
+             # Null Handling Behavior
+
+             The behavior of null handling differs based on whether a `separator` is provided:
+
+             - **When `separator` is specified**: The function uses Spark's
+               `concat_ws`, which **ignores `NULL` values**. In this case, `NULL`
+               values are treated as empty strings (`""`) and are excluded from the
+               final concatenated result.
+             - **When `separator` is not specified**: The function defaults to
+               using Spark's `concat`, which **returns `NULL` if any of the
+               concatenated values is `NULL`**. This means the presence of a `NULL`
+               in any input will make the entire output `NULL`.
+
+         Args:
+             context: The context in which this Action is executed.
+             name: The name of the new concatenated column.
+             columns: A list of columns to be concatenated.
+             separator: The separator used between concatenated column values.
+
+         Raises:
+             ValueError: If no name is provided.
+             ValueError: If no columns are provided.
+             ValueError: If the data from context is None.
+             ValueError: If 'columns' is not a list.
+
+         Returns:
+             The context after the execution of this Action, containing the
+             DataFrame with the concatenated column.
+         """
+         if not name:
+             raise ValueError("No name provided.")
+         if not columns:
+             raise ValueError("No columns provided.")
+
+         if context.data is None:
+             raise ValueError("The data from context is required for the operation.")
+
+         df = context.data
+
+         if isinstance(columns, list):
+             if separator:
+                 df = df.withColumn(name, F.concat_ws(separator, *columns))  # type: ignore
+             else:
+                 df = df.withColumn(name, F.concat(*columns))  # type: ignore
+         else:
+             raise ValueError("'columns' should be a list, like ['col1', 'col2',]")
+
+         return context.from_existing(data=df)  # type: ignore
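The docstring warning above hinges on the difference between Spark's `concat_ws` and `concat`. A small, self-contained PySpark example (made-up rows) that shows both behaviors side by side:

```python
# Plain PySpark illustration of the null-handling difference:
# concat_ws skips NULL values, concat propagates them.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Main St 1", "12345", "DE"), ("Main St 2", None, "DE")],
    ["street", "postcode", "country"],
)

df.select(
    F.concat_ws(", ", "street", "postcode", "country").alias("with_separator"),    # NULL postcode is skipped
    F.concat("street", "postcode", "country").alias("without_separator"),          # NULL postcode makes the row NULL
).show(truncate=False)
```

This is why the `separator` option is not only cosmetic: choosing it also switches the action's null semantics.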
@@ -0,0 +1,102 @@
+ from typing import Any
+
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import col, from_json, schema_of_json, unbase64
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformDecodeAction(PipelineAction):
+     """Decodes values of a specified column in the DataFrame based on the given format.
+
+     Example:
+         ```yaml
+         Decode Columns:
+             action: TRANSFORM_DECODE
+             options:
+                 column: configurations
+                 input_format: json
+         ```
+     """
+
+     name: str = "TRANSFORM_DECODE"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         column: str | None = None,
+         input_format: str | None = None,
+         schema: str | None = None,
+         **_: Any,  # define kwargs to match the base class signature
+     ) -> PipelineContext:
+         """Decodes values of a specified column in the DataFrame based on the given format.
+
+         Args:
+             context: The context in which this Action is executed.
+             column: The name of the column that should be decoded.
+             input_format: The format from which the column should be decoded.
+                 Currently supported formats are 'base64' and 'json'.
+             schema: For JSON input, the schema of the JSON object. If empty,
+                 the schema is inferred from the first row of the DataFrame. For base64 input,
+                 the data type to which the column is cast.
+
+         Raises:
+             ValueError: If no column is specified.
+             ValueError: If no input_format is specified.
+             ValueError: If the data from context is None.
+             ValueError: If an invalid input_format is provided.
+
+         Returns:
+             The context after the execution of this Action, containing the DataFrame with the decoded column(s).
+         """
+         if not column:
+             raise ValueError("No column specified.")
+         if not input_format:
+             raise ValueError("No input_format specified")
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         df = context.data
+         match input_format.lower():
+             case "base64":
+                 df = self._decode_base64(df, column, schema)  # type: ignore
+             case "json":
+                 df = self._decode_json(df, column, schema)  # type: ignore
+             case _:
+                 raise ValueError(
+                     f"Invalid input_format: [ '{input_format}' ]. Please specify a valid format to decode.",
+                 )
+
+         return context.from_existing(data=df)  # type: ignore
+
+     def _decode_base64(self, df: DataFrame, column: str, base64_schema: str | None):
+         """Decode base64 column."""
+         df_decoded = df.withColumn(column, unbase64(col(column)))
+         if base64_schema:
+             df_decoded = df_decoded.withColumn(column, col(column).cast(base64_schema))
+         return df_decoded
+
+     def _decode_json(self, df: DataFrame, column: str, json_schema: str | None):
+         """Decode json column."""
+         distinct_schemas = (
+             df.select(column)
+             .withColumn("json_schema", schema_of_json(col(column)))
+             .select("json_schema")
+             .dropDuplicates()
+         )
+         if not (json_schema or distinct_schemas.count() > 0):
+             raise RuntimeError("Cannot infer schema from empty DataFrame.")
+
+         elif distinct_schemas.count() > 1:
+             raise RuntimeError(f"There is more than one JSON schema in column {column}.")
+
+         if json_schema is None:
+             final_json_schema = distinct_schemas.collect()[0].json_schema
+
+         else:
+             final_json_schema = json_schema  # type: ignore
+
+         df_decoded = df.withColumn(column, from_json(col(column), final_json_schema)).select(*df.columns, f"{column}.*")
+
+         return df_decoded
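The JSON branch of TRANSFORM_DECODE ultimately parses the string column with `from_json` and flattens the resulting struct via `select(*df.columns, f"{column}.*")`. The self-contained PySpark sketch below shows that final step with an explicit DDL schema, which corresponds to passing the action's `schema` option; when `schema` is omitted, the action instead tries to infer a single schema from the column via `schema_of_json`, as the code above shows.

```python
# Plain PySpark illustration of the from_json + flatten step, with made-up data
# and an explicit DDL schema (the action's `schema` option).
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('{"retries": 3, "region": "eu"}',)], ["configurations"])

decoded = df.withColumn(
    "configurations",
    from_json(col("configurations"), "retries INT, region STRING"),
).select(*df.columns, "configurations.*")

decoded.show()  # keeps the parsed struct and adds top-level columns: retries, region
```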