cloe-nessy 0.2.9 (cloe_nessy-0.2.9-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/__init__.py +0 -0
- cloe_nessy/clients/__init__.py +5 -0
- cloe_nessy/clients/api_client/__init__.py +3 -0
- cloe_nessy/clients/api_client/api_client.py +188 -0
- cloe_nessy/clients/api_client/api_response.py +72 -0
- cloe_nessy/clients/api_client/auth.py +178 -0
- cloe_nessy/clients/api_client/exceptions.py +22 -0
- cloe_nessy/file_utilities/__init__.py +3 -0
- cloe_nessy/file_utilities/exceptions.py +4 -0
- cloe_nessy/file_utilities/factory.py +42 -0
- cloe_nessy/file_utilities/get_file_paths.py +72 -0
- cloe_nessy/file_utilities/location_types.py +29 -0
- cloe_nessy/file_utilities/strategies/__init__.py +0 -0
- cloe_nessy/file_utilities/strategies/base_strategy.py +59 -0
- cloe_nessy/file_utilities/strategies/local_strategy.py +51 -0
- cloe_nessy/file_utilities/strategies/onelake_strategy.py +31 -0
- cloe_nessy/file_utilities/strategies/utils_strategy.py +72 -0
- cloe_nessy/integration/__init__.py +0 -0
- cloe_nessy/integration/reader/__init__.py +6 -0
- cloe_nessy/integration/reader/api_reader.py +141 -0
- cloe_nessy/integration/reader/catalog_reader.py +49 -0
- cloe_nessy/integration/reader/excel_reader.py +170 -0
- cloe_nessy/integration/reader/exceptions.py +10 -0
- cloe_nessy/integration/reader/file_reader.py +96 -0
- cloe_nessy/integration/reader/reader.py +34 -0
- cloe_nessy/integration/writer/__init__.py +3 -0
- cloe_nessy/integration/writer/catalog_writer.py +48 -0
- cloe_nessy/logging/__init__.py +3 -0
- cloe_nessy/logging/logger_mixin.py +162 -0
- cloe_nessy/models/__init__.py +13 -0
- cloe_nessy/models/column.py +65 -0
- cloe_nessy/models/constraint.py +9 -0
- cloe_nessy/models/foreign_key.py +34 -0
- cloe_nessy/models/mixins/__init__.py +0 -0
- cloe_nessy/models/mixins/read_instance_mixin.py +124 -0
- cloe_nessy/models/mixins/template_loader_mixin.py +18 -0
- cloe_nessy/models/schema.py +76 -0
- cloe_nessy/models/table.py +236 -0
- cloe_nessy/models/types.py +7 -0
- cloe_nessy/object_manager/__init__.py +3 -0
- cloe_nessy/object_manager/table_manager.py +58 -0
- cloe_nessy/pipeline/__init__.py +7 -0
- cloe_nessy/pipeline/actions/__init__.py +50 -0
- cloe_nessy/pipeline/actions/read_api.py +178 -0
- cloe_nessy/pipeline/actions/read_catalog_table.py +68 -0
- cloe_nessy/pipeline/actions/read_excel.py +177 -0
- cloe_nessy/pipeline/actions/read_files.py +105 -0
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +66 -0
- cloe_nessy/pipeline/actions/transform_change_datatype.py +56 -0
- cloe_nessy/pipeline/actions/transform_concat_columns.py +88 -0
- cloe_nessy/pipeline/actions/transform_decode.py +102 -0
- cloe_nessy/pipeline/actions/transform_distinct.py +40 -0
- cloe_nessy/pipeline/actions/transform_filter.py +51 -0
- cloe_nessy/pipeline/actions/transform_generic_sql.py +66 -0
- cloe_nessy/pipeline/actions/transform_join.py +81 -0
- cloe_nessy/pipeline/actions/transform_json_normalize.py +106 -0
- cloe_nessy/pipeline/actions/transform_rename_columns.py +60 -0
- cloe_nessy/pipeline/actions/transform_replace_values.py +59 -0
- cloe_nessy/pipeline/actions/transform_select_columns.py +83 -0
- cloe_nessy/pipeline/actions/transform_union.py +71 -0
- cloe_nessy/pipeline/actions/write_catalog_table.py +73 -0
- cloe_nessy/pipeline/pipeline.py +201 -0
- cloe_nessy/pipeline/pipeline_action.py +62 -0
- cloe_nessy/pipeline/pipeline_config.py +92 -0
- cloe_nessy/pipeline/pipeline_context.py +56 -0
- cloe_nessy/pipeline/pipeline_parsing_service.py +156 -0
- cloe_nessy/pipeline/pipeline_step.py +50 -0
- cloe_nessy/py.typed +0 -0
- cloe_nessy/session/__init__.py +3 -0
- cloe_nessy/session/session_manager.py +188 -0
- cloe_nessy/settings/__init__.py +3 -0
- cloe_nessy/settings/settings.py +91 -0
- cloe_nessy/utils/__init__.py +0 -0
- cloe_nessy/utils/file_and_directory_handler.py +19 -0
- cloe_nessy-0.2.9.dist-info/METADATA +26 -0
- cloe_nessy-0.2.9.dist-info/RECORD +78 -0
- cloe_nessy-0.2.9.dist-info/WHEEL +5 -0
- cloe_nessy-0.2.9.dist-info/top_level.txt +1 -0
cloe_nessy/pipeline/actions/read_catalog_table.py
@@ -0,0 +1,68 @@
+from typing import Any
+
+from ...integration.reader import CatalogReader
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class ReadCatalogTableAction(PipelineAction):
+    """Reads a table from Unity Catalog using a specified table identifier and optional reader configurations.
+
+    This function retrieves data from a catalog table using the
+    [`CatalogReader`][cloe_nessy.integration.reader.catalog_reader] identified
+    by either the `table_identifier` parameter or the `table_metadata` from the
+    provided `PipelineContext` of a previous step. The retrieved data is loaded
+    into a DataFrame and returned as part of an updated `PipelineContext`.
+
+    Example:
+        ```yaml
+        Read Sales Table:
+            action: READ_CATALOG_TABLE
+            options:
+                table_identifier: my_catalog.business_schema.sales_table
+                options: <options for the reader>
+        ```
+    """
+
+    name: str = "READ_CATALOG_TABLE"
+
+    @staticmethod
+    def run(
+        context: PipelineContext,
+        *,
+        table_identifier: str | None = None,
+        options: dict[str, str] | None = None,
+        **_: Any,  # define kwargs to match the base class signature
+    ) -> PipelineContext:
+        """Reads a table from Unity Catalog using a specified table identifier and optional reader configurations.
+
+        Args:
+            context: The pipeline's context, which contains
+                metadata and configuration for the action.
+            table_identifier: The identifier of the catalog table to
+                read. If not provided, the function will attempt to use the table
+                identifier from the `table_metadata` in the `context`.
+            options: A dictionary of options for customizing
+                the catalog reader's behavior, such as filters or reading modes. Defaults
+                to None.
+
+        Raises:
+            ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
+
+        Returns:
+            An updated pipeline context containing the data read from the catalog table as a DataFrame.
+        """
+        if not options:
+            options = dict()
+
+        if (table_metadata := context.table_metadata) and table_identifier is None:
+            table_identifier = table_metadata.identifier
+        if table_identifier is None:
+            raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
+
+        table_reader = CatalogReader()
+        df = table_reader.read(
+            table_identifier=table_identifier,
+            **options,
+        )
+        return context.from_existing(data=df)
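For orientation, here is a minimal sketch of calling this action directly instead of through a parsed pipeline. The class, option names, and context attributes come from the diff above; the import paths and the way a `PipelineContext` is constructed are assumptions and may differ in the released package.

```python
# Sketch only: direct invocation of ReadCatalogTableAction outside a pipeline.
# The import paths and PipelineContext() construction are assumptions.
from cloe_nessy.pipeline import PipelineContext                  # assumed re-export
from cloe_nessy.pipeline.actions import ReadCatalogTableAction   # assumed re-export

context = PipelineContext()  # assumed: an empty context with no table_metadata set

# run() is a staticmethod; any `options` are passed through to CatalogReader.read().
result = ReadCatalogTableAction.run(
    context,
    table_identifier="my_catalog.business_schema.sales_table",
)

df = result.data  # the DataFrame read from Unity Catalog
```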
cloe_nessy/pipeline/actions/read_excel.py
@@ -0,0 +1,177 @@
+from collections.abc import Callable
+from functools import reduce
+
+from pyspark.sql import DataFrame
+
+from ...file_utilities import get_file_paths
+from ...integration.reader import ExcelDataFrameReader
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class ReadExcelAction(PipelineAction):
+    """Reads data from an Excel file or directory of Excel files and returns a DataFrame.
+
+    The function reads Excel files using the
+    [`ExcelDataFrameReader`][cloe_nessy.integration.reader.excel_reader] either
+    from a single file or a directory path. It can read specific sheets, handle
+    file extensions, and offers various options to customize how the data is
+    read, such as specifying headers, index columns, and handling missing
+    values. The resulting data is returned as a DataFrame, and metadata about
+    the read files can be included in the context.
+
+    Example:
+        ```yaml
+        Read Excel Table:
+            action: READ_EXCEL
+            options:
+                file: excel_file_folder/excel_files_june/interesting_excel_file.xlsx
+                usecols:
+                    - key_column
+                    - interesting_column
+                options: <more options for the reader>
+        ```
+    """
+
+    name: str = "READ_EXCEL"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        file: str | None = None,
+        path: str | None = None,
+        extension: str = "xlsx",
+        recursive: bool = False,
+        sheet_name: str | int | list = 0,
+        sheet_name_as_column: bool = False,
+        header: int | list[int] = 0,
+        index_col: int | list[int] | None = None,
+        usecols: int | str | list | Callable | None = None,
+        dtype: str | None = None,
+        fillna: str | dict[str, list[str]] | dict[str, str] | None = None,
+        true_values: list | None = None,
+        false_values: list | None = None,
+        nrows: int | None = None,
+        na_values: list[str] | dict[str, list[str]] | None = None,
+        keep_default_na: bool = True,
+        parse_dates: bool | list | dict = False,
+        date_parser: Callable | None = None,
+        thousands: str | None = None,
+        include_index: bool = False,
+        options: dict | None = None,
+        add_metadata_column: bool = True,
+        load_as_strings: bool = False,
+        **_,
+    ) -> PipelineContext:
+        """Reads data from an Excel file or directory of Excel files and returns a DataFrame.
+
+        Args:
+            context: The context in which the action is executed.
+            file: The path to a single Excel file. Either `file` or `path` must be specified.
+            path: The directory path containing multiple Excel files. Either `file` or `path` must be specified.
+            extension: The file extension to look for when reading from a directory.
+            recursive: Whether to include subdirectories when reading from a directory path.
+            sheet_name: The sheet name(s) or index(es) to read from the Excel file.
+            sheet_name_as_column: Whether to add a column with the sheet name to the DataFrame.
+            header: Row number(s) to use as the column labels.
+            index_col: Column(s) to use as the index of the DataFrame.
+            usecols: Subset of columns to parse. Can be an integer, string, list,
+                or function.
+            dtype: Data type for the columns.
+            fillna: Method or value to use to fill NaN values.
+            true_values: Values to consider as True.
+            false_values: Values to consider as False.
+            nrows: Number of rows to parse.
+            na_values: Additional strings to recognize as NaN/NA.
+            keep_default_na: Whether to append default NaN values when custom `na_values` are specified.
+            parse_dates: Options for parsing date columns.
+            date_parser: Function to use for converting strings to datetime objects.
+            thousands: Thousands separator to use when parsing numeric columns.
+            include_index: Whether to include an index column in the output DataFrame.
+            options: Additional options to pass to the DataFrame reader.
+            add_metadata_column: Whether to add a metadata column with file information to the DataFrame.
+            load_as_strings: Whether to load all columns as strings.
+
+        Raises:
+            ValueError: Raised if both `file` and `path` are specified, or if neither is provided.
+
+        Returns:
+            The updated context, with the read data as a DataFrame.
+        """
+        if not options:
+            options = dict()
+
+        if file is not None and path is not None:
+            self._tabular_logger.error("message: Only one of file or path have to be specified.")
+            raise ValueError("Only one of file or path have to be specified.")
+
+        excel_reader = ExcelDataFrameReader()
+        if file is not None:
+            df = excel_reader.read(
+                location=file,
+                sheet_name=sheet_name,
+                sheet_name_as_column=sheet_name_as_column,
+                header=header,
+                index_col=index_col,
+                usecols=usecols,
+                true_values=true_values,
+                false_values=false_values,
+                nrows=nrows,
+                dtype=dtype,
+                fillna=fillna,
+                na_values=na_values,
+                keep_default_na=keep_default_na,
+                parse_dates=parse_dates,
+                date_parser=date_parser,
+                thousands=thousands,
+                include_index=include_index,
+                options=options,
+                add_metadata_column=add_metadata_column,
+                load_as_strings=load_as_strings,
+            )
+        elif path is not None:
+            file_list = get_file_paths(path, extension, recursive)
+            df_dict: dict = {}
+            for path in file_list:
+                df_dict[path] = excel_reader.read(
+                    location=path,
+                    sheet_name=sheet_name,
+                    sheet_name_as_column=sheet_name_as_column,
+                    header=header,
+                    index_col=index_col,
+                    usecols=usecols,
+                    dtype=dtype,
+                    fillna=fillna,
+                    true_values=true_values,
+                    false_values=false_values,
+                    nrows=nrows,
+                    na_values=na_values,
+                    keep_default_na=keep_default_na,
+                    parse_dates=parse_dates,
+                    date_parser=date_parser,
+                    thousands=thousands,
+                    include_index=include_index,
+                    options=options,
+                    add_metadata_column=add_metadata_column,
+                    load_as_strings=load_as_strings,
+                )
+            df = reduce(DataFrame.unionAll, list(df_dict.values()))
+
+        else:
+            self._tabular_logger.error("action_name: READ_EXCEL | message: Either file or path have to be specified.")
+            raise ValueError("Either file or path have to be specified.")
+
+        runtime_info = context.runtime_info
+
+        if add_metadata_column:
+            read_files_list = list(set([x.file_path for x in df.select("__metadata.file_path").collect()]))
+            if runtime_info is None:
+                runtime_info = {"read_files": read_files_list}
+            else:
+                try:
+                    runtime_info["read_files"] = list(set(runtime_info["read_files"] + read_files_list))
+                except KeyError:
+                    runtime_info["read_files"] = read_files_list
+
+        return context.from_existing(data=df)
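A short sketch of the action's two mutually exclusive entry points when called directly. Import paths and `PipelineContext` construction are assumptions; the referenced Excel locations are placeholders and must be reachable by `ExcelDataFrameReader`.

```python
# Sketch only: the file-vs-path behavior of ReadExcelAction.run().
# Import paths and PipelineContext() construction are assumptions.
from cloe_nessy.pipeline import PipelineContext           # assumed re-export
from cloe_nessy.pipeline.actions import ReadExcelAction   # assumed re-export

action = ReadExcelAction()   # run() is an instance method (it uses self._tabular_logger)
context = PipelineContext()  # assumed: empty context

# Single file: `file` is set, `path` stays None.
ctx_one = action.run(
    context,
    file="excel_file_folder/excel_files_june/interesting_excel_file.xlsx",
    usecols=["key_column", "interesting_column"],
)

# Directory: every *.xlsx under the folder (recursively) is read, and the per-file
# DataFrames are combined with DataFrame.unionAll, so all files need the same columns.
ctx_many = action.run(
    context,
    path="excel_file_folder/excel_files_june/",
    recursive=True,
)

# Passing both `file` and `path` raises ValueError, as does passing neither.
```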
cloe_nessy/pipeline/actions/read_files.py
@@ -0,0 +1,105 @@
+from typing import Any
+
+from ...integration.reader import FileReader
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class ReadFilesAction(PipelineAction):
+    """Reads files from a specified location.
+
+    If an extension is provided, all files with the given extension will be read
+    using the [`FileReader`][cloe_nessy.integration.reader.file_reader]. If no
+    extension is provided, the `spark_format` must be set, and all files in the
+    location will be read using a DataFrameReader with the specified format.
+
+    Example:
+        ```yaml
+        Read Excel Table:
+            action: READ_FILES
+            options:
+                location: excel_file_folder/excel_files_june/
+                search_subdirs: True
+                spark_format: AVRO
+        ```
+    """
+
+    name: str = "READ_FILES"
+
+    @staticmethod
+    def run(
+        context: PipelineContext,
+        *,
+        location: str | None = None,
+        search_subdirs: bool = False,
+        extension: str | None = None,
+        spark_format: str | None = None,
+        schema: str | None = None,
+        add_metadata_column: bool = True,
+        options: dict[str, str] | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Reads files from a specified location.
+
+        Args:
+            context: The context in which this Action is executed.
+            location: The location from which to read files.
+            search_subdirs: Recursively search subdirectories for files
+                if an extension is provided.
+            extension: The file extension to filter files by.
+            spark_format: The format to use for reading the files.
+            schema: The schema of the data. If None, schema is obtained from
+                the context metadata.
+            add_metadata_column: Whether to include the `__metadata` column with
+                file metadata in the DataFrame.
+            options: Additional options passed to the reader.
+
+        Raises:
+            ValueError: If neither `extension` nor `spark_format` are provided, or if
+                no location is specified.
+
+        Returns:
+            The context after the Action has been executed, containing the read data as a DataFrame.
+        """
+        if not location:
+            raise ValueError("No location provided. Please specify location to read files from.")
+        if not options:
+            options = dict()
+
+        if (metadata := context.table_metadata) and schema is None:
+            schema = metadata.schema
+
+        file_reader = FileReader()
+        if extension:
+            df = file_reader.read(
+                location=location,
+                schema=schema,
+                extension=extension,
+                search_subdirs=search_subdirs,
+                options=options,
+                add_metadata_column=add_metadata_column,
+            )
+        elif spark_format:
+            df = file_reader.read(
+                location=location,
+                schema=schema,
+                spark_format=spark_format,
+                options=options,
+                add_metadata_column=add_metadata_column,
+            )
+        else:
+            raise ValueError("Please provide either the 'extension' or 'spark_format'")
+
+        runtime_info = context.runtime_info
+
+        if add_metadata_column:
+            read_files_list = [x.file_path for x in df.select("__metadata.file_path").drop_duplicates().collect()]
+            if runtime_info is None:
+                runtime_info = {"read_files": read_files_list}
+            else:
+                try:
+                    runtime_info["read_files"] = list(set(runtime_info["read_files"] + read_files_list))
+                except KeyError:
+                    runtime_info["read_files"] = read_files_list
+
+        return context.from_existing(data=df, runtime_info=runtime_info)
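A minimal sketch of the two read modes of this action, assuming direct invocation; the import paths, context construction, and example locations are placeholders rather than documented usage.

```python
# Sketch only: extension-based vs format-based reads with ReadFilesAction.run().
# Import paths and PipelineContext() construction are assumptions.
from cloe_nessy.pipeline import PipelineContext            # assumed re-export
from cloe_nessy.pipeline.actions import ReadFilesAction    # assumed re-export

context = PipelineContext()  # assumed: empty context

# Mode 1: filter by file extension; FileReader discovers the matching files.
ctx_csv = ReadFilesAction.run(
    context,
    location="landing_zone/sales/",
    extension="csv",
    search_subdirs=True,
)

# Mode 2: no extension, so a Spark format must be given instead.
ctx_avro = ReadFilesAction.run(
    context,
    location="landing_zone/events/",
    spark_format="avro",
)

# With add_metadata_column=True (the default), the distinct file paths that were
# read are recorded under runtime_info["read_files"] on the returned context.
print(ctx_avro.runtime_info["read_files"])
```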
cloe_nessy/pipeline/actions/read_metadata_yaml.py
@@ -0,0 +1,66 @@
+import pathlib
+from typing import Any
+
+from ...models import Schema
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class ReadMetadataYAMLAction(PipelineAction):
+    """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.
+
+    Example:
+        ```yaml
+        Read Schema Metadata:
+            action: READ_METADATA_YAML_ACTION
+            options:
+                path: excel_file_folder/excel_files_june/
+                file_name: sales_schema.yml
+                table_name: sales
+        ```
+    """
+
+    name: str = "READ_METADATA_YAML_ACTION"
+
+    @staticmethod
+    def run(
+        context: PipelineContext,
+        *,
+        path: str | None = None,
+        file_name: str | None = None,
+        table_name: str | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Reads schema metadata from a yaml file using the `Schema` model.
+
+        Args:
+            context: The context in which this Action is executed.
+            path: The path to the data contract directory.
+            file_name: The name of the file that defines the schema.
+            table_name: The name of the table for which to retrieve metadata.
+
+        Raises:
+            ValueError: If any issues occur while reading the schema, such as an invalid schema,
+                missing file, or missing path.
+
+        Returns:
+            The context after the execution of this Action, containing the table metadata.
+        """
+        if not path:
+            raise ValueError("No path provided. Please specify path to schema metadata.")
+        if not file_name:
+            raise ValueError("No file_name provided. Please specify file name.")
+        if not table_name:
+            raise ValueError("No table_name provided. Please specify table name.")
+
+        path_obj = pathlib.Path(path)
+
+        schema, errors = Schema.read_instance_from_file(path_obj / file_name)
+        if errors:
+            raise ValueError(f"Errors while reading schema metadata: {errors}")
+        if not schema:
+            raise ValueError("No schema found in metadata.")
+
+        table = schema.get_table_by_name(table_name=table_name)
+
+        return context.from_existing(table_metadata=table)
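A sketch of direct use, under the same assumptions as the earlier examples; the layout of the schema YAML file is defined by the `Schema` model and is not reproduced here.

```python
# Sketch only: loading table metadata from a schema YAML file.
# Import paths, PipelineContext() construction, and the contract paths are assumptions.
from cloe_nessy.pipeline import PipelineContext                   # assumed re-export
from cloe_nessy.pipeline.actions import ReadMetadataYAMLAction    # assumed re-export

context = PipelineContext()  # assumed: empty context

ctx = ReadMetadataYAMLAction.run(
    context,
    path="contracts/sales/",       # directory holding the data contract (placeholder)
    file_name="sales_schema.yml",  # parsed via Schema.read_instance_from_file
    table_name="sales",            # looked up with schema.get_table_by_name
)

# The returned context carries no data yet, only table_metadata, which later
# actions such as READ_FILES or READ_CATALOG_TABLE can fall back on.
table = ctx.table_metadata
```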
cloe_nessy/pipeline/actions/transform_change_datatype.py
@@ -0,0 +1,56 @@
+from typing import Any
+
+import pyspark.sql.functions as F
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformChangeDatatypeAction(PipelineAction):
+    """Changes the datatypes of specified columns in the given DataFrame.
+
+    Example:
+        ```yaml
+        Transform Columns:
+            action: TRANSFORM_CHANGE_DATATYPE
+            options:
+                columns:
+                    id: string
+                    revenue: long
+        ```
+    """
+
+    name: str = "TRANSFORM_CHANGE_DATATYPE"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        columns: dict[str, str] | None = None,
+        **_: Any,  # define kwargs to match the base class signature
+    ) -> PipelineContext:
+        """Changes the datatypes of specified columns in the given DataFrame.
+
+        Args:
+            context: The context in which this Action is executed.
+            columns: A dictionary where the key is the column
+                name and the value is the desired datatype.
+
+        Raises:
+            ValueError: If no columns are provided.
+            ValueError: If the data from context is None.
+
+        Returns:
+            The context after the execution of this Action, containing the DataFrame with updated column datatypes.
+        """
+        if not columns:
+            raise ValueError("No columns provided.")
+
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        df = context.data
+        change_columns = {col: F.col(col).cast(dtype) for col, dtype in columns.items()}
+        df = df.withColumns(change_columns)  # type: ignore
+
+        return context.from_existing(data=df)  # type: ignore
+
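The core of this action is a dictionary comprehension of casts fed to `DataFrame.withColumns`. A standalone PySpark sketch of that mechanic, independent of the pipeline wiring and using made-up toy data, looks like this.

```python
# Sketch only: the cast mechanic used by TRANSFORM_CHANGE_DATATYPE,
# reproduced on a toy DataFrame without the pipeline context.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "19.99"), (2, "5.00")], ["id", "revenue"])

columns = {"id": "string", "revenue": "double"}  # column name -> target datatype
change_columns = {col: F.col(col).cast(dtype) for col, dtype in columns.items()}

df = df.withColumns(change_columns)  # DataFrame.withColumns requires Spark >= 3.3
df.printSchema()  # id: string, revenue: double
```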
cloe_nessy/pipeline/actions/transform_concat_columns.py
@@ -0,0 +1,88 @@
+from typing import Any
+
+import pyspark.sql.functions as F
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformConcatColumnsAction(PipelineAction):
+    """Concatenates the specified columns in the given DataFrame.
+
+    Example:
+        ```yaml
+        Concat Columns:
+            action: TRANSFORM_CONCAT_COLUMNS
+            options:
+                name: address
+                columns:
+                    - street
+                    - postcode
+                    - country
+                separator: ', '
+        ```
+    """
+
+    name: str = "TRANSFORM_CONCAT_COLUMNS"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        name: str = "",
+        columns: list[str] | None = None,
+        separator: str | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Concatenates the specified columns in the given DataFrame.
+
+        !!!warning
+
+            # Null Handling Behavior
+
+            The behavior of null handling differs based on whether a `separator` is provided:
+
+            - **When `separator` is specified**: The function uses Spark's
+              `concat_ws`, which **ignores `NULL` values**. In this case, `NULL`
+              values are treated as empty strings (`""`) and are excluded from the
+              final concatenated result.
+            - **When `separator` is not specified**: The function defaults to
+              using Spark's `concat`, which **returns `NULL` if any of the
+              concatenated values is `NULL`**. This means the presence of a `NULL`
+              in any input will make the entire output `NULL`.
+
+        Args:
+            context: The context in which this Action is executed.
+            name: The name of the new concatenated column.
+            columns: A list of columns to be concatenated.
+            separator: The separator used between concatenated column values.
+
+        Raises:
+            ValueError: If no name is provided.
+            ValueError: If no columns are provided.
+            ValueError: If the data from context is None.
+            ValueError: If 'columns' is not a list.
+
+        Returns:
+            The context after the execution of this Action, containing the
+            DataFrame with the concatenated column.
+        """
+        if not name:
+            raise ValueError("No name provided.")
+        if not columns:
+            raise ValueError("No columns provided.")
+
+        if context.data is None:
+            raise ValueError("The data from context is required for the operation.")
+
+        df = context.data
+
+        if isinstance(columns, list):
+            if separator:
+                df = df.withColumn(name, F.concat_ws(separator, *columns))  # type: ignore
+            else:
+                df = df.withColumn(name, F.concat(*columns))  # type: ignore
+        else:
+            raise ValueError("'columns' should be a list, like ['col1', 'col2',]")
+
+        return context.from_existing(data=df)  # type: ignore
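The NULL-handling difference called out in the docstring warning can be shown directly with plain PySpark, outside the action; the toy rows below are made up for illustration.

```python
# Sketch only: concat_ws (separator given) skips NULLs, concat (no separator)
# returns NULL as soon as any input is NULL -- the behavior the warning describes.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Main St 1", None, "DE"), ("Side St 2", "10115", "DE")],
    ["street", "postcode", "country"],
)

df = df.withColumn("with_sep", F.concat_ws(", ", "street", "postcode", "country"))
df = df.withColumn("no_sep", F.concat("street", "postcode", "country"))

df.show(truncate=False)
# Row 1: with_sep -> "Main St 1, DE"  (NULL postcode is skipped by concat_ws)
#        no_sep   -> NULL             (any NULL input makes concat return NULL)
```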
cloe_nessy/pipeline/actions/transform_decode.py
@@ -0,0 +1,102 @@
+from typing import Any
+
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import col, from_json, schema_of_json, unbase64
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformDecodeAction(PipelineAction):
+    """Decodes values of a specified column in the DataFrame based on the given format.
+
+    Example:
+        ```yaml
+        Decode Columns:
+            action: TRANSFORM_DECODE
+            options:
+                column: configurations
+                input_format: json
+        ```
+    """
+
+    name: str = "TRANSFORM_DECODE"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        column: str | None = None,
+        input_format: str | None = None,
+        schema: str | None = None,
+        **_: Any,  # define kwargs to match the base class signature
+    ) -> PipelineContext:
+        """Decodes values of a specified column in the DataFrame based on the given format.
+
+        Args:
+            context: The context in which this Action is executed.
+            column: The name of the column that should be decoded.
+            input_format: The format from which the column should be decoded.
+                Currently supported formats are 'base64' and 'json'.
+            schema: For JSON input, the schema of the JSON object. If empty,
+                the schema is inferred from the first row of the DataFrame. For base64 input,
+                the data type to which the column is cast.
+
+        Raises:
+            ValueError: If no column is specified.
+            ValueError: If no input_format is specified.
+            ValueError: If the data from context is None.
+            ValueError: If an invalid input_format is provided.
+
+        Returns:
+            The context after the execution of this Action, containing the DataFrame with the decoded column(s).
+        """
+        if not column:
+            raise ValueError("No column specified.")
+        if not input_format:
+            raise ValueError("No input_format specified")
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        df = context.data
+        match input_format.lower():
+            case "base64":
+                df = self._decode_base64(df, column, schema)  # type: ignore
+            case "json":
+                df = self._decode_json(df, column, schema)  # type: ignore
+            case _:
+                raise ValueError(
+                    f"Invalid input_format: [ '{input_format}' ]. Please specify a valid format to decode.",
+                )
+
+        return context.from_existing(data=df)  # type: ignore
+
+    def _decode_base64(self, df: DataFrame, column: str, base64_schema: str | None):
+        """Decode base64 column."""
+        df_decoded = df.withColumn(column, unbase64(col(column)))
+        if base64_schema:
+            df_decoded = df_decoded.withColumn(column, col(column).cast(base64_schema))
+        return df_decoded
+
+    def _decode_json(self, df: DataFrame, column: str, json_schema: str | None):
+        """Decode json column."""
+        distinct_schemas = (
+            df.select(column)
+            .withColumn("json_schema", schema_of_json(col(column)))
+            .select("json_schema")
+            .dropDuplicates()
+        )
+        if not (json_schema or distinct_schemas.count() > 0):
+            raise RuntimeError("Cannot infer schema from empty DataFrame.")
+
+        elif distinct_schemas.count() > 1:
+            raise RuntimeError(f"There is more than one JSON schema in column {column}.")
+
+        if json_schema is None:
+            final_json_schema = distinct_schemas.collect()[0].json_schema
+        else:
+            final_json_schema = json_schema  # type: ignore
+
+        df_decoded = df.withColumn(column, from_json(col(column), final_json_schema)).select(*df.columns, f"{column}.*")
+
+        return df_decoded
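A standalone PySpark sketch of what the `json` decode path produces, using an explicit DDL schema string and made-up toy data; when `schema` is omitted, the action instead infers the schema with `schema_of_json` and requires a single distinct schema in the column.

```python
# Sketch only: JSON decoding as performed by TRANSFORM_DECODE, with an explicit schema.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("1", '{"retries": 3, "region": "eu"}')],
    ["id", "configurations"],
)

json_schema = "retries INT, region STRING"  # DDL-style schema string
decoded = df.withColumn(
    "configurations", from_json(col("configurations"), json_schema)
).select(*df.columns, "configurations.*")

decoded.show()
# Keeps the original columns and adds one top-level column per JSON field
# (retries, region), mirroring the action's final select.
```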