cloe-nessy 0.3.2__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/integration/reader/file_reader.py +7 -1
- cloe_nessy/integration/writer/catalog_writer.py +1 -1
- cloe_nessy/pipeline/actions/__init__.py +2 -0
- cloe_nessy/pipeline/actions/read_api.py +69 -45
- cloe_nessy/pipeline/actions/read_catalog_table.py +9 -9
- cloe_nessy/pipeline/actions/read_excel.py +14 -10
- cloe_nessy/pipeline/actions/read_files.py +54 -28
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +9 -9
- cloe_nessy/pipeline/actions/transform_change_datatype.py +13 -8
- cloe_nessy/pipeline/actions/transform_clean_column_names.py +81 -0
- cloe_nessy/pipeline/actions/transform_concat_columns.py +25 -11
- cloe_nessy/pipeline/actions/transform_decode.py +18 -7
- cloe_nessy/pipeline/actions/transform_deduplication.py +9 -9
- cloe_nessy/pipeline/actions/transform_distinct.py +22 -5
- cloe_nessy/pipeline/actions/transform_filter.py +6 -6
- cloe_nessy/pipeline/actions/transform_generic_sql.py +12 -6
- cloe_nessy/pipeline/actions/transform_group_aggregate.py +20 -26
- cloe_nessy/pipeline/actions/transform_join.py +17 -10
- cloe_nessy/pipeline/actions/transform_json_normalize.py +19 -6
- cloe_nessy/pipeline/actions/transform_rename_columns.py +7 -7
- cloe_nessy/pipeline/actions/transform_replace_values.py +8 -8
- cloe_nessy/pipeline/actions/transform_select_columns.py +38 -9
- cloe_nessy/pipeline/actions/transform_union.py +12 -8
- cloe_nessy/pipeline/actions/write_catalog_table.py +11 -10
- cloe_nessy/session/session_manager.py +13 -7
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.5.dist-info}/METADATA +1 -1
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.5.dist-info}/RECORD +29 -28
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.5.dist-info}/WHEEL +1 -1
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.5.dist-info}/top_level.txt +0 -0
cloe_nessy/integration/reader/file_reader.py

@@ -46,7 +46,13 @@ class FileReader(BaseReader):
         if not spark_format and not extension:
             raise ValueError("Either spark_format or extension must be provided.")
         self._console_logger.debug(f"Reading files from [ '{location}' ] ...")
-        extension_to_datatype_dict = {
+        extension_to_datatype_dict = {
+            "csv": "csv",
+            "json": "json",
+            "parquet": "parquet",
+            "txt": "text",
+            "xml": "xml",
+        }

         if extension and not spark_format:
             if extension not in extension_to_datatype_dict:
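The new extension-to-format mapping is what lets the reader derive a Spark format from a file extension when only `extension` is given. A minimal, illustrative sketch of that lookup (the helper name is hypothetical; this is not the package's implementation):

```python
extension_to_datatype_dict = {
    "csv": "csv",
    "json": "json",
    "parquet": "parquet",
    "txt": "text",
    "xml": "xml",
}

def resolve_spark_format(extension: str | None, spark_format: str | None) -> str:
    """Prefer an explicit spark_format, otherwise derive it from the extension."""
    if spark_format:
        return spark_format
    if not extension:
        raise ValueError("Either spark_format or extension must be provided.")
    try:
        return extension_to_datatype_dict[extension.lower()]
    except KeyError as err:
        raise ValueError(f"Unsupported extension: {extension!r}") from err

print(resolve_spark_format("txt", None))  # -> "text"
```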
cloe_nessy/integration/writer/catalog_writer.py

@@ -20,7 +20,7 @@ class CatalogWriter:
                 format 'catalog.schema.table'.
             mode: The write mode. One of append, overwrite, error, errorifexists, ignore.
             partition_by: Names of the partitioning columns.
-            options:
+            options: PySpark options for the DataFrame.saveAsTable operation (e.g. mergeSchema:true).

         Notes:
             append: Append contents of this DataFrame to existing data.
cloe_nessy/pipeline/actions/__init__.py

@@ -7,6 +7,7 @@ from .read_excel import ReadExcelAction
 from .read_files import ReadFilesAction
 from .read_metadata_yaml import ReadMetadataYAMLAction
 from .transform_change_datatype import TransformChangeDatatypeAction
+from .transform_clean_column_names import TransformCleanColumnNamesAction
 from .transform_concat_columns import TransformConcatColumnsAction
 from .transform_decode import TransformDecodeAction
 from .transform_distinct import TransformDistinctAction
@@ -39,6 +40,7 @@ __all__ = [
     "TransformFilterAction",
     "TransformUnionAction",
     "TransformChangeDatatypeAction",
+    "TransformCleanColumnNamesAction",
     "TransformConcatColumnsAction",
     "TransformDecodeAction",
     "TransformDistinctAction",
cloe_nessy/pipeline/actions/read_api.py

@@ -55,51 +55,75 @@ class ReadAPIAction(PipelineAction):
         DataFrame containing the response data.

     Example:
-        ... (45 removed docstring lines not rendered in this diff view)
+        === "Basic Usage"
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    endpoint: my/endpoint/
+            ```
+        === "Usage with Parameters and Headers"
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    endpoint: my/endpoint/
+                    method: GET
+                    timeout: 90
+                    headers:
+                        key1: value1
+                        key2: value2
+                    params:
+                        key1: value1
+                        key2: value2
+            ```
+        === "Usage with Authentication"
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    endpoint: my/endpoint/
+                    method: GET
+                    timeout: 90
+                    auth:
+                        - type: basic
+                          username: my_username
+                          password: my_password
+                        - type: secret_scope
+                          secret_scope: my_secret_scope
+                          header_template:
+                              "header_key_1": "<ENVIRONMENT_VARIABLE_NAME>"
+                        - type: secret_scope
+                          secret_scope: my_secret_scope
+                          header_template:
+                              "header_key_2": "<SECRET_NAME>"
+                        - type: secret_scope
+                          secret_scope: my_other_secret_scope
+                          header_template:
+                              "header_key_3": "<SECRET_NAME>"
+                        - type: azure_oauth
+                          client_id: my_client_id
+                          client_secret: my_client_secret
+                          tenant_id: my_tenant_id
+                          scope: <entra-id-client-id>
+            ```
+
+        The above example will combine the headers from the different auth types. The resulting header will look like this:
+
+        ```json
+        {
+            "header_key_1": "value_from_environment_variable",
+            "header_key_2": "value_from_secret",
+            "header_key_3": "value_from_secret",
+            "Authorization": "Bearer <access_token> (from azure_oauth)",
+            "Authorization": "Basic am9obkBleGFtcGxlLmNvbTphYmMxMjM= (from basic)"
+        }
+        ```
+
+        !!! warning "Secret information"
             Don't write sensitive information like passwords or tokens directly in the pipeline configuration.
             Use secret scopes or environment variables instead.
     """
cloe_nessy/pipeline/actions/read_catalog_table.py

@@ -15,13 +15,13 @@ class ReadCatalogTableAction(PipelineAction):
     into a DataFrame and returned as part of an updated `PipelineContext`.

     Example:
-        ... (7 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Read Sales Table:
+            action: READ_CATALOG_TABLE
+            options:
+                table_identifier: my_catalog.business_schema.sales_table
+                options: <options for the CatalogReader read method>
+        ```
     """

     name: str = "READ_CATALOG_TABLE"
@@ -43,8 +43,8 @@ class ReadCatalogTableAction(PipelineAction):
                 read. If not provided, the function will attempt to use the table
                 identifier from the `table_metadata` in the `context`.
             options: A dictionary of options for customizing
-                the
-                to None.
+                the [`CatalogReader`][cloe_nessy.integration.reader.catalog_reader]
+                behavior, such as filters or reading modes. Defaults to None.

         Raises:
             ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
cloe_nessy/pipeline/actions/read_excel.py

@@ -21,16 +21,20 @@ class ReadExcelAction(PipelineAction):
     the read files can be included in the context.

     Example:
-        ... (10 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Read Excel Table:
+            action: READ_EXCEL
+            options:
+                file: excel_file_folder/excel_files_june/interesting_excel_file.xlsx
+                usecols:
+                    - key_column
+                    - interesting_column
+                options: <options for the ExcelDataFrameReader read method>
+        ```
+
+        !!! note "More Options"
+            The `READ_EXCEL` action supports additional options that can be passed to the
+            run method. For more information, refer to the method documentation.
     """

     name: str = "READ_EXCEL"
cloe_nessy/pipeline/actions/read_files.py

@@ -14,14 +14,47 @@ class ReadFilesAction(PipelineAction):
     location will be read using a DataFrameReader with the specified format.

     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        === "Read files specified by spark_format"
+            ```yaml
+            Read Files:
+                action: READ_FILES
+                options:
+                    location: json_file_folder/
+                    search_subdirs: True
+                    spark_format: JSON
+            ```
+            !!! note "Define Spark Format"
+                Use the `spark_format` option to specify the format with which
+                to read the files. Supported formats are e.g., `CSV`, `JSON`,
+                `PARQUET`, `TEXT`, and `XML`.
+
+        === "Read files specified by extension"
+            ```yaml
+            Read Files:
+                action: READ_FILES
+                options:
+                    location: csv_file_folder/
+                    search_subdirs: True
+                    extension: csv
+            ```
+            !!! note "Define Extension"
+                Use the `extension` option to specify the extension of the files
+                to read. If not specified, the `spark_format` will be derived from
+                the extension.
+
+        === "Read files with a specified spark_format AND extension"
+            ```yaml
+            Read Files:
+                action: READ_FILES
+                options:
+                    location: file_folder/
+                    extension: abc_custom_extension # specifies the files to read
+                    spark_format: CSV # specifies the format to read the files with
+            ```
+            !!! note "Define both Extension & Spark Format"
+                Use the `extension` option to specify the extension of the files
+                to read. Additionally, use the `spark_format` option to specify
+                the format with which to read the files.
     """

     name: str = "READ_FILES"
@@ -47,7 +80,8 @@ class ReadFilesAction(PipelineAction):
             search_subdirs: Recursively search subdirectories for files
                 if an extension is provided.
             extension: The file extension to filter files by.
-            spark_format: The format to use for reading the files.
+            spark_format: The format to use for reading the files. If not provided,
+                it will be deferred from the file extension.
             schema: The schema of the data. If None, schema is obtained from
                 the context metadata.
             add_metadata_column: Whether to include the `__metadata` column with
@@ -65,30 +99,22 @@ class ReadFilesAction(PipelineAction):
             raise ValueError("No location provided. Please specify location to read files from.")
         if not options:
             options = dict()
+        if not spark_format and not extension:
+            raise ValueError("Either spark_format or extension must be provided.")

         if (metadata := context.table_metadata) and schema is None:
             schema = metadata.schema

         file_reader = FileReader()
-        ... (9 removed lines not rendered in this diff view)
-        elif spark_format:
-            df = file_reader.read(
-                location=location,
-                schema=schema,
-                spark_format=spark_format,
-                options=options,
-                add_metadata_column=add_metadata_column,
-            )
-        else:
-            raise ValueError("Please provide either the 'extension' or 'spark_format'")
+        df = file_reader.read(
+            location=location,
+            schema=schema,
+            extension=extension,
+            spark_format=spark_format,
+            search_subdirs=search_subdirs,
+            options=options,
+            add_metadata_column=add_metadata_column,
+        )

         runtime_info = context.runtime_info

cloe_nessy/pipeline/actions/read_metadata_yaml.py

@@ -10,14 +10,14 @@ class ReadMetadataYAMLAction(PipelineAction):
     """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.

     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Read Schema Metadata:
+            action: READ_METADATA_YAML_ACTION
+            options:
+                path: excel_file_folder/excel_files_june/
+                file_name: sales_schema.yml
+                table_name: sales
+        ```
     """

     name: str = "READ_METADATA_YAML_ACTION"
@@ -31,7 +31,7 @@ class ReadMetadataYAMLAction(PipelineAction):
         table_name: str | None = None,
         **_: Any,
     ) -> PipelineContext:
-        """Reads schema metadata from a yaml file using the `Schema` model.
+        """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.

         Args:
             context: The context in which this Action is executed.
cloe_nessy/pipeline/actions/transform_change_datatype.py

@@ -9,15 +9,20 @@ from ..pipeline_context import PipelineContext
 class TransformChangeDatatypeAction(PipelineAction):
     """Changes the datatypes of specified columns in the given DataFrame.

+    !!! note "Data Types"
+        We make use of the PySpark `cast` function to change the data types of
+        the columns. Valid data types can be found in the [PySpark
+        documentation](https://spark.apache.org/docs/3.5.3/sql-ref-datatypes.html).
+
     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Cast Columns:
+            action: TRANSFORM_CHANGE_DATATYPE
+            options:
+                columns:
+                    id: string
+                    revenue: long
+        ```
     """

     name: str = "TRANSFORM_CHANGE_DATATYPE"
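The note above points at PySpark's `cast`, and the YAML `columns` mapping translates directly into per-column casts. A minimal sketch of that equivalence (column names match the example; this is not the action's source):

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 19.99)], ["id", "revenue"])

# Mirrors the YAML above: {"id": "string", "revenue": "long"} (hypothetical columns).
columns = {"id": "string", "revenue": "long"}
for column, dtype in columns.items():
    df = df.withColumn(column, F.col(column).cast(dtype))

df.printSchema()
```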
cloe_nessy/pipeline/actions/transform_clean_column_names.py (new file)

@@ -0,0 +1,81 @@
+import json
+import re
+from typing import Any
+
+import pyspark.sql.functions as F
+import pyspark.sql.types as T
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformCleanColumnNamesAction(PipelineAction):
+    """Fixes column names in the DataFrame to be valid.
+
+    Removes invalid characters from the column names, including the fields of a struct and
+    replaces a single leading underscore by a double underscore.
+
+    Invalid characters include:
+        - Any non-word character (anything other than letters, digits, and underscores).
+        - A single leading underscore.
+
+    Example:
+        ```yaml
+        Clean Column Names:
+            action: TRANSFORM_CLEAN_COLUMN_NAMES
+        ```
+    """
+
+    name: str = "TRANSFORM_CLEAN_COLUMN_NAMES"
+
+    def run(
+        self,
+        context: PipelineContext,
+        **_: Any,
+    ) -> PipelineContext:
+        """Fixes column names in the DataFrame to be valid.
+
+        Removes invalid characters from the column names, including the fields of a struct and
+        replaces a single leading underscore by a double underscore.
+
+        Args:
+            context: The context in which this Action is executed.
+
+        Raises:
+            ValueError: If the data from the context is None.
+
+        Returns:
+            The context after the execution of this Action, containing the DataFrame with cleaned column names.
+        """
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        with_columns_renamed = {}
+        with_columns_casted: dict[str, T.StructType | T.ArrayType | T.MapType] = {}
+
+        single_underscrore_at_beginning = r"^_(?=[^_])"
+
+        for c in context.data.schema:
+            old_name = c.name
+            new_name = re.sub(single_underscrore_at_beginning, "__", re.sub("\W", "_", old_name))
+            with_columns_renamed[old_name] = new_name
+
+            if isinstance(c.dataType, (T.StructType | T.ArrayType | T.MapType)):
+                old_column_schema = c.dataType.json()
+                new_column_schema = re.sub(
+                    r'(?<="name":")[^"]+',
+                    lambda m: re.sub("\W", "_", str(m.group())),
+                    old_column_schema,
+                )
+                if isinstance(c.dataType, T.StructType):
+                    with_columns_casted[new_name] = T.StructType.fromJson(json.loads(new_column_schema))
+                elif isinstance(c.dataType, T.ArrayType):
+                    with_columns_casted[new_name] = T.ArrayType.fromJson(json.loads(new_column_schema))
+                elif isinstance(c.dataType, T.MapType):
+                    with_columns_casted[new_name] = T.MapType.fromJson(json.loads(new_column_schema))
+
+        df = context.data.withColumnsRenamed(with_columns_renamed)
+        for c_name, c_type in with_columns_casted.items():
+            df = df.withColumn(c_name, F.col(c_name).cast(c_type))
+
+        return context.from_existing(data=df)  # type: ignore
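To make the renaming rules concrete, here is a tiny standalone sketch of the same two regular expressions applied to a few sample column names (illustrative helper only, not the package API):

```python
import re

def clean_name(name: str) -> str:
    # Replace non-word characters with "_", then double a single leading underscore,
    # mirroring the rules described in the new action above.
    cleaned = re.sub(r"\W", "_", name)
    return re.sub(r"^_(?=[^_])", "__", cleaned)

for raw in ["order id", "price (EUR)", "_internal", "__already_ok"]:
    print(f"{raw!r} -> {clean_name(raw)!r}")
# 'order id'     -> 'order_id'
# 'price (EUR)'  -> 'price__EUR_'   (each non-word character becomes "_")
# '_internal'    -> '__internal'
# '__already_ok' -> '__already_ok'
```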
cloe_nessy/pipeline/actions/transform_concat_columns.py

@@ -10,17 +10,31 @@ class TransformConcatColumnsAction(PipelineAction):
     """Concatenates the specified columns in the given DataFrame.

     Example:
-        ... (11 removed docstring lines not rendered in this diff view)
+        === "concat with separator"
+            ```yaml
+            Concat Columns:
+                action: TRANSFORM_CONCAT_COLUMNS
+                options:
+                    name: address
+                    columns:
+                        - street
+                        - postcode
+                        - country
+                    separator: ', '
+            ```
+        === "concat without separator"
+            ```yaml
+            Concat Column:
+                action: TRANSFORM_CONCAT_COLUMNS
+                options:
+                    name: address
+                    columns:
+                        - street
+                        - postcode
+                        - country
+            ```
+            !!! warning "beware of null handling"
+                The `separator` option is not provided, so the default behavior is to use `concat` which returns `NULL` if any of the concatenated values is `NULL`.
     """

     name: str = "TRANSFORM_CONCAT_COLUMNS"
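The null-handling warning mirrors the difference between PySpark's `concat` and `concat_ws`. A quick sketch of the two behaviors (column names taken from the example, data invented; not the action's internals):

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Main St", "12345", None)],
    "street string, postcode string, country string",
)

# concat: any NULL input makes the whole result NULL.
# concat_ws: NULL inputs are skipped, so a separator-based concat still yields a value.
df.select(
    F.concat("street", "postcode", "country").alias("concat"),
    F.concat_ws(", ", "street", "postcode", "country").alias("concat_ws"),
).show(truncate=False)
# concat -> NULL, concat_ws -> "Main St, 12345"
```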
cloe_nessy/pipeline/actions/transform_decode.py

@@ -11,13 +11,24 @@ class TransformDecodeAction(PipelineAction):
     """Decodes values of a specified column in the DataFrame based on the given format.

     Example:
-        ... (7 removed docstring lines not rendered in this diff view)
+        === "Decode JSON column"
+            ```yaml
+            Expand JSON:
+                action: "TRANSFORM_DECODE"
+                options:
+                    column: "data"
+                    input_format: "json"
+                    schema: "quality INT, timestamp TIMESTAMP, value DOUBLE"
+            ```
+        === "Decode base64 column"
+            ```yaml
+            Decode base64:
+                action: TRANSFORM_DECODE
+                options:
+                    column: encoded_data
+                    input_format: base64
+                    schema: string
+            ```
     """

     name: str = "TRANSFORM_DECODE"
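For orientation, the two `input_format` variants correspond roughly to PySpark's `from_json` (against a DDL schema) and `unbase64`. A hedged sketch with made-up data, not the action's implementation:

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [('{"quality": 1, "value": 2.5}', "aGVsbG8=")],
    "data string, encoded_data string",
)

decoded = df.select(
    F.from_json("data", "quality INT, value DOUBLE").alias("data"),   # JSON decode with a DDL schema
    F.unbase64("encoded_data").cast("string").alias("decoded_text"),  # base64 decode to a string
)
decoded.show(truncate=False)  # decoded_text -> "hello"
```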
cloe_nessy/pipeline/actions/transform_deduplication.py

@@ -18,15 +18,15 @@ class TransformDeduplication(PipelineAction):
     (can be changed to lowest by setting the parameter descending to false).

     Example:
-        ... (9 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Deduplicate Columns:
+            action: TRANSFORM_DEDUPLICATION
+            options:
+                key_columns:
+                    - id
+                order_by_columns:
+                    - source_file_modification_time
+        ```
     """

     name: str = "TRANSFORM_DEDUPLICATION"
cloe_nessy/pipeline/actions/transform_distinct.py

@@ -7,11 +7,17 @@ from ..pipeline_context import PipelineContext
 class TransformDistinctAction(PipelineAction):
     """Selects distinct rows from the DataFrame in the given context.

+    If a subset is given these columns are used for duplicate comparison. If no subset is given all columns are used.
+
     Example:
-        ... (4 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Distinct Columns:
+            action: TRANSFORM_DISTINCT
+            options:
+                subset:
+                    - first_name
+                    - last_name
+        ```
     """

     name: str = "TRANSFORM_DISTINCT"
@@ -19,12 +25,15 @@ class TransformDistinctAction(PipelineAction):
     def run(
         self,
         context: PipelineContext,
+        *,
+        subset: list[str] | None = None,
         **_: Any,
     ) -> PipelineContext:
         """Selects distinct rows from the DataFrame in the given context.

         Args:
             context: The context in which this Action is executed.
+            subset: List of column names to use for duplicate comparison (default All columns).

         Raises:
             ValueError: If the data from the context is None.
@@ -35,6 +44,14 @@ class TransformDistinctAction(PipelineAction):
         if context.data is None:
             raise ValueError("Data from the context is required for the operation.")

-        ... (1 removed line not rendered in this diff view)
+        # check if all columns that are part of the subset are actually part of the dataframe.
+        if subset is not None:
+            subset_columns_not_in_dataframe = set(subset) - set(context.data.columns)
+            if len(subset_columns_not_in_dataframe) != 0:
+                raise ValueError(
+                    f"The following subset columns are not part of the dataframe: {subset_columns_not_in_dataframe}"
+                )
+
+        df = context.data.dropDuplicates(subset=subset)

         return context.from_existing(data=df)  # type: ignore
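A compact sketch of the new `subset` behavior and its guard (invented data; the check mirrors the one added above, the rest is plain `dropDuplicates`):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Ada", "Lovelace", 1), ("Ada", "Lovelace", 2), ("Alan", "Turing", 3)],
    "first_name string, last_name string, visit int",
)

subset = ["first_name", "last_name"]      # hypothetical subset, as in the YAML example
missing = set(subset) - set(df.columns)   # same guard the action adds
if missing:
    raise ValueError(f"The following subset columns are not part of the dataframe: {missing}")

df.dropDuplicates(subset=subset).show()   # keeps one row per (first_name, last_name)
```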
cloe_nessy/pipeline/actions/transform_filter.py

@@ -8,12 +8,12 @@ class TransformFilterAction(PipelineAction):
     """Filters the DataFrame in the given context based on a specified condition.

     Example:
-        ... (6 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Filter Columns:
+            action: TRANSFORM_FILTER
+            options:
+                condition: city="Hamburg"
+        ```
     """

     name: str = "TRANSFORM_FILTER"
cloe_nessy/pipeline/actions/transform_generic_sql.py

@@ -13,12 +13,18 @@ class TransformSqlAction(PipelineAction):
     statement is executed on that view. The resulting DataFrame is returned.

     Example:
-        ... (6 removed docstring lines not rendered in this diff view)
+        ```yaml
+        SQL Transform:
+            action: TRANSFORM_SQL
+            options:
+                sql_statement: select city, revenue, firm from {DATA_FRAME} where product="Databricks"
+        ```
+        !!! note
+            The SQL statement should reference the DataFrame as "{DATA_FRAME}".
+            This nessy specific placeholder will be replaced with your input
+            DataFrame from the context. If your pipeline is defined as an
+            f-string, you can escape the curly braces by doubling them, e.g.,
+            "{{DATA_FRAME}}".
     """

     name: str = "TRANSFORM_SQL"
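The surrounding docstring says the input DataFrame is registered as a temporary view before the statement runs; the placeholder mechanics then look roughly like this (view name and data are made up, and this is only a sketch of the idea, not the action's code):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Hamburg", 100, "ACME", "Databricks")],
    "city string, revenue int, firm string, product string",
)

sql_statement = 'select city, revenue, firm from {DATA_FRAME} where product="Databricks"'

# Register the input DataFrame as a temp view and point {DATA_FRAME} at it.
df.createOrReplaceTempView("input_df")
result = spark.sql(sql_statement.format(DATA_FRAME="input_df"))
result.show()
```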
cloe_nessy/pipeline/actions/transform_group_aggregate.py

@@ -13,33 +13,27 @@ class TransformGroupAggregate(PipelineAction):
     to other columns. The aggregation functions can be specified as a dictionary where keys are column names
     and values are either a single aggregation function or a list of functions.

+    The output DataFrame will contain the grouped columns and the aggregated columns with the aggregation
+    function as a prefix to the column name.
+
     Example:
-        ... (17 removed docstring lines not rendered in this diff view)
-    Methods:
-        run(context, grouping_columns=None, aggregations=None, **_):
-            Executes the aggregation on the grouped data.
-
-    Raises:
-        ValueError: If the context data is None.
-        ValueError: If no aggregations are provided.
-        ValueError: If invalid aggregation operations are provided.
-        ValueError: If columns with unsupported data types are included in the aggregations.
+        ```yaml
+        Transform Group Aggregate:
+            action: TRANSFORM_GROUP_AGGREGATE
+            options:
+                grouping_columns:
+                    - column1
+                    - column2
+                aggregations:
+                    column3:
+                        - sum
+                        - avg
+                    column4: max
+        ```
+
+        This example groups the DataFrame by `column1` and `column2` and aggregates `column3` by sum and average
+        and `column4` by max. The resulting DataFrame will contain the grouped columns `column1` and `column2`
+        and the aggregated columns `sum_column3`, `avg_column3`, and `max_column4`.
     """

     name: str = "TRANSFORM_GROUP_AGGREGATE"
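The `<function>_<column>` naming convention described above corresponds to aliasing each aggregate expression. A sketch of the equivalent PySpark (illustrative, not the action's internals; column names follow the example):

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", "x", 1, 10), ("a", "x", 3, 20)],
    "column1 string, column2 string, column3 int, column4 int",
)

# Equivalent of the YAML above, with the "<function>_<column>" output naming.
aggregations = {"column3": ["sum", "avg"], "column4": ["max"]}
exprs = [
    getattr(F, func)(col).alias(f"{func}_{col}")
    for col, funcs in aggregations.items()
    for func in funcs
]
df.groupBy("column1", "column2").agg(*exprs).show()
# -> column1, column2, sum_column3, avg_column3, max_column4
```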
cloe_nessy/pipeline/actions/transform_join.py

@@ -8,18 +8,25 @@ from ..pipeline_step import PipelineStep
 class TransformJoinAction(PipelineAction):
     """Joins the current DataFrame with another DataFrame defined in joined_data.

-    The join operation is performed based on specified columns and the type of
-    indicated by the `how` parameter.
+    The join operation is performed based on specified columns and the type of
+    join indicated by the `how` parameter. Supported join types can be taken
+    from [PySpark
+    documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html)

     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Join Tables:
+            action: TRANSFORM_JOIN
+            options:
+                joined_data: ((step:Transform First Table))
+                join_on: id
+                how: anti
+        ```
+
+        !!! note "Referencing a DataFrame from another step"
+            The `joined_data` parameter is a reference to the DataFrame from another step.
+            The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
+            for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
     """

     name: str = "TRANSFORM_JOIN"
cloe_nessy/pipeline/actions/transform_json_normalize.py

@@ -14,12 +14,25 @@ class TransformJsonNormalize(PipelineAction):
     structs are appended after existing columns.

     Example:
-        ... (6 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Normalize Tables:
+            action: TRANSFORM_JSON_NORMALIZE
+            options:
+                exclude_columns: coordinates
+        ```
+        Example Input Data:
+
+        | id | name  | coordinates  | attributes                |
+        |----|-------|--------------|---------------------------|
+        | 1  | Alice | [10.0, 20.0] | {"age": 30, "city": "NY"} |
+        | 2  | Bob   | [30.0, 40.0] | {"age": 25, "city": "LA"} |
+
+        Example Output Data:
+
+        | id | name  | coordinates  | attributes_age | attributes_city |
+        |----|-------|--------------|----------------|-----------------|
+        | 1  | Alice | [10.0, 20.0] | 30             | NY              |
+        | 2  | Bob   | [30.0, 40.0] | 25             | LA              |
     """

     name: str = "TRANSFORM_JSON_NORMALIZE"
cloe_nessy/pipeline/actions/transform_rename_columns.py

@@ -12,13 +12,13 @@ class TransformRenameColumnsAction(PipelineAction):
     name and its corresponding value represents the new column name.

     Example:
-        ... (7 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Rename Column:
+            action: TRANSFORM_RENAME_COLUMNS
+            options:
+                columns:
+                    a_very_long_column_name: shortname
+        ```
     """

     name: str = "TRANSFORM_RENAME_COLUMNS"
cloe_nessy/pipeline/actions/transform_replace_values.py

@@ -13,14 +13,14 @@ class TransformReplaceValuesAction(PipelineAction):
     in the specified columns.

     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Replace Values:
+            action: TRANSFORM_REPLACE_VALUES
+            options:
+                replace:
+                    empl_function:
+                        sales_employee: seller
+        ```
     """

     name: str = "TRANSFORM_REPLACE_VALUES"
cloe_nessy/pipeline/actions/transform_select_columns.py

@@ -14,15 +14,44 @@ class TransformSelectColumnsAction(PipelineAction):
     DataFrame before performing the selection.

     Example:
-        ... (9 removed docstring lines not rendered in this diff view)
+        Example Input Data:
+
+        | id | name  | coordinates  | attributes                |
+        |----|-------|--------------|---------------------------|
+        | 1  | Alice | [10.0, 20.0] | {"age": 30, "city": "NY"} |
+        | 2  | Bob   | [30.0, 40.0] | {"age": 25, "city": "LA"} |
+        === "Include Columns"
+            ```yaml
+            Select Columns:
+                action: TRANSFORM_SELECT_COLUMNS
+                options:
+                    include_columns:
+                        - id
+                        - name
+                        - coordinates
+            ```
+            Example Output Data:
+
+            | id | name  | coordinates  |
+            |----|-------|--------------|
+            | 1  | Alice | [10.0, 20.0] |
+            | 2  | Bob   | [30.0, 40.0] |
+
+        === "Exclude Columns"
+            ```yaml
+            Select Columns:
+                action: TRANSFORM_SELECT_COLUMNS
+                options:
+                    exclude_columns:
+                        - coordinates
+            ```
+            Example Output Data:
+
+            | id | name  | attributes                |
+            |----|-------|---------------------------|
+            | 1  | Alice | {"age": 30, "city": "NY"} |
+            | 2  | Bob   | {"age": 25, "city": "LA"} |
+
     """

     name: str = "TRANSFORM_SELECT_COLUMNS"
cloe_nessy/pipeline/actions/transform_union.py

@@ -17,14 +17,18 @@ class TransformUnionAction(PipelineAction):
     empty, a ValueError will be raised.

     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Union Tables:
+            action: TRANSFORM_UNION
+            options:
+                union_data:
+                    - ((step: Filter First Table))
+                    - ((step: SQL Transform Second Table))
+        ```
+        !!! note "Referencing a DataFrame from another step"
+            The `union_data` parameter is a reference to the DataFrame from another step.
+            The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
+            for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
     """

     name: str = "TRANSFORM_UNION"
cloe_nessy/pipeline/actions/write_catalog_table.py

@@ -9,15 +9,16 @@ class WriteCatalogTableAction(PipelineAction):
     """Writes a DataFrame to a specified catalog table using [CatalogWriter][cloe_nessy.integration.writer.CatalogWriter].

     Example:
-        ... (9 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Write Table to Catalog:
+            action: WRITE_CATALOG_TABLE
+            options:
+                table_identifier: my_catalog.business_schema.sales_table
+                mode: append
+                partition_by: day
+                options:
+                    mergeSchema: true
+        ```
     """

     name: str = "WRITE_CATALOG_TABLE"
@@ -42,7 +43,7 @@ class WriteCatalogTableAction(PipelineAction):
             mode: The write mode. One of 'append', 'overwrite', 'error',
                 'errorifexists', or 'ignore'.
             partition_by: Names of the partitioning columns.
-            options:
+            options: PySpark options for the DataFrame.saveAsTable operation (e.g. mergeSchema:true).

         Raises:
             ValueError: If the table name is not specified or cannot be inferred from
cloe_nessy/session/session_manager.py

@@ -63,13 +63,13 @@ class SessionManager:
     @classmethod
     def get_utils(
         cls,
-    ) -> Any:  # return type should be Union[DBUtils, MsSparkUtils
-        """Get or create a DBUtils or MsSparkUtils instance, depending on the context.
+    ) -> Any:  # return type should be Union[DBUtils, MsSparkUtils, RemoteDbUtils].
+        """Get or create a DBUtils, RemoteDbUtils or MsSparkUtils instance, depending on the context.

-        In Databricks this will return DBUtils,
+        In Databricks this will return DBUtils, when using Databricks-Connect it returns RemoteDbUtils, and in Fabric it will return MsSparkUtils.

         Returns:
-            utils: The DBUtils or MsSparkUtils instance.
+            utils: The DBUtils, RemoteDbUtils or MsSparkUtils instance.

         Raises:
             RuntimeError: If the instance cannot be created.
@@ -88,19 +88,25 @@ class SessionManager:
         }

         try:
-            cls._utils = utils_function[cls._env](
+            cls._utils = utils_function[cls._env]()  # type: ignore
         except Exception as e:
             raise RuntimeError(f"Cannot create utils instance. Error: {e}") from e

         return cls._utils

+    @classmethod
     def _get_dbutils(cls):
+        if cls._env == cls.Environment.DATABRICKS_CONNECT:
+            from databricks.sdk import WorkspaceClient
+
+            return WorkspaceClient().dbutils
+
         from pyspark.dbutils import DBUtils

         cls.get_spark_session()
-        ... (1 removed line not rendered in this diff view)
-        return utils
+        return DBUtils(cls._spark)

+    @classmethod
     def _get_mssparkutils(cls):
         from notebookutils import mssparkutils  # type: ignore

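The changes above tighten the environment-based dispatch: the factory is now called with no arguments, and Databricks-Connect gets its own branch that returns `WorkspaceClient().dbutils`. A minimal, self-contained sketch of that dispatch pattern (the enum values and helper bodies are simplified stand-ins, not the SessionManager internals):

```python
from enum import Enum, auto

class Environment(Enum):
    DATABRICKS = auto()
    DATABRICKS_CONNECT = auto()
    FABRIC = auto()

def _get_dbutils():
    # Placeholder for pyspark.dbutils.DBUtils(spark) or WorkspaceClient().dbutils.
    return "DBUtils instance"

def _get_mssparkutils():
    # Placeholder for notebookutils.mssparkutils in Fabric.
    return "mssparkutils instance"

utils_function = {
    Environment.DATABRICKS: _get_dbutils,
    Environment.DATABRICKS_CONNECT: _get_dbutils,  # Databricks-Connect path
    Environment.FABRIC: _get_mssparkutils,
}

env = Environment.DATABRICKS_CONNECT
utils = utils_function[env]()  # called with no arguments, as in the fix above
print(utils)
```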
{cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.5.dist-info}/RECORD

@@ -22,10 +22,10 @@ cloe_nessy/integration/reader/api_reader.py,sha256=j3Z5O1oH-Zc43TyA_aYtnDNYC9xFM
 cloe_nessy/integration/reader/catalog_reader.py,sha256=tGK-Y0jZQGOrF9eZUzSr7ils-L58uex6qH9PZ81ZLy8,1835
 cloe_nessy/integration/reader/excel_reader.py,sha256=4kifpIakHpGmap0-P0SUgjJoQdY-eeiZBIDrQp87wK8,8012
 cloe_nessy/integration/reader/exceptions.py,sha256=_A9jFpe_RIDZCGY76qzjic9bsshxns6yXPSl141dq1c,203
-cloe_nessy/integration/reader/file_reader.py,sha256=
+cloe_nessy/integration/reader/file_reader.py,sha256=1os8pZIXAGTJBZjGREmHOTlZeabbikC7sDv5xn3bIjE,3950
 cloe_nessy/integration/reader/reader.py,sha256=e2KVPePQme8SBQJEbL-3zpGasOgTiEvKFTslow2wGPw,1034
 cloe_nessy/integration/writer/__init__.py,sha256=NIh0t1RYlG3J1Y5_CvnR36N9tISmcElD5Tq06ksmqoA,71
-cloe_nessy/integration/writer/catalog_writer.py,sha256=
+cloe_nessy/integration/writer/catalog_writer.py,sha256=Gb-hMdADgO_uUJ7mZPHBYyNme2qXsdFFnzwo7GcShHM,2192
 cloe_nessy/logging/__init__.py,sha256=ySVCVbdyR3Dno_tl2ZfiER_7EVaDoQMHVkNyfdMZumY,65
 cloe_nessy/logging/logger_mixin.py,sha256=9iy7BF6drYme-f7Rrt_imbVBRgVqQ89xjcP1X5aMtfY,7467
 cloe_nessy/models/__init__.py,sha256=_JPN_R5-QDfjYzvrvZDdeOezl0C-JTG-Rk4S1VE5vJM,242
@@ -47,34 +47,35 @@ cloe_nessy/pipeline/pipeline_config.py,sha256=BN3ZSbr6bC-X9edoh-n5vRfPHFMbgtAU7m
 cloe_nessy/pipeline/pipeline_context.py,sha256=csElDc6BsynDUtRXgQOSCH7ONc_b-ag0YEg0zlQTz58,1874
 cloe_nessy/pipeline/pipeline_parsing_service.py,sha256=c_nAsgw81QYBM9AFiTxGgqRhNXABkDKplbeoCJPtbpE,6434
 cloe_nessy/pipeline/pipeline_step.py,sha256=UlnmpS6gm_dZ7m9dD1mZvye7mvUF_DA7HjOZo0oGYDU,1977
-cloe_nessy/pipeline/actions/__init__.py,sha256=
-cloe_nessy/pipeline/actions/read_api.py,sha256=
-cloe_nessy/pipeline/actions/read_catalog_table.py,sha256
-cloe_nessy/pipeline/actions/read_excel.py,sha256=
-cloe_nessy/pipeline/actions/read_files.py,sha256=
-cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=
-cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=
-cloe_nessy/pipeline/actions/ ... (14 further 0.3.2 action entries truncated in this rendering)
+cloe_nessy/pipeline/actions/__init__.py,sha256=LwKctXy4Jun52BnCVGvWa8nnKVjTSov4GT58j6Zy8zg,2273
+cloe_nessy/pipeline/actions/read_api.py,sha256=RBv5XeHtjTXuCP09Fqo6JNx6iIhQQI-nuAHCuSaGs2s,7778
+cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=-k2wezkv8bE_xwoW7WM1ORhrCXQagKTUuXkhI2ZEROs,2783
+cloe_nessy/pipeline/actions/read_excel.py,sha256=Mhl3r_2Hqk2XN7Fl5WqqAyE4JdnwSiivbhWMglyBtkE,7961
+cloe_nessy/pipeline/actions/read_files.py,sha256=N9bFgtG1tovhp2JayxE5YiN9PiO2lgG2-6h_Y6tD2eU,5220
+cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=3ZDy9qiDYtM1oDQzHPC23hLOvHjhdk5zg1wVHE60m9k,2295
+cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=24Tn6R3TvUkWCh8V6naLdyNbCbqvyPOOoer-hy_Ebq4,2077
+cloe_nessy/pipeline/actions/transform_clean_column_names.py,sha256=-CEdcXb7Fz5DQNitGlJ8EVBE_LzxfsInyCIO-D7b4iY,3042
+cloe_nessy/pipeline/actions/transform_concat_columns.py,sha256=Nk8YbhxDnFZsWzW9Dj5Yl76Uq6VrcMlevQPHGms65L8,3777
+cloe_nessy/pipeline/actions/transform_decode.py,sha256=JajMwHREtxa8u_1Q3RZDBVMjncoSel-WzQFVTO0MREg,4455
+cloe_nessy/pipeline/actions/transform_deduplication.py,sha256=E0ypz9qkHMSatNfnHekP-E6svQVL149M4PV02M03drg,5099
+cloe_nessy/pipeline/actions/transform_distinct.py,sha256=c7aBxANyqT4aKhm0cSELDtD-bP0Se9vxlBF0K4AgQWs,1976
+cloe_nessy/pipeline/actions/transform_filter.py,sha256=Nz_ggRfKIcNzYFfFOsgq1QeatjdEis0up4I7cOWBdyo,1446
+cloe_nessy/pipeline/actions/transform_generic_sql.py,sha256=_naWfmPdYAUKjPNeHu5qJAohOL7DHCSYz_kwoeRv3OI,2741
+cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=KUHeeP-RIDi34dpbsPEJkzea5zFJA6MuyjNpOsFud9o,4045
+cloe_nessy/pipeline/actions/transform_join.py,sha256=e_tvMk8YJTAWcUK_EmOgNt0s31ICZoMX_MKOTWx4lBY,3645
+cloe_nessy/pipeline/actions/transform_json_normalize.py,sha256=petF7pnNq1EKc8MqVdG0weFALAHNILSe_eAu4Z5XxIo,4833
+cloe_nessy/pipeline/actions/transform_rename_columns.py,sha256=4zJcPCONMU4C67qeuzsrX3AORRRHoq_selUI7FJyeg0,1952
+cloe_nessy/pipeline/actions/transform_replace_values.py,sha256=1OPHTrjcphfyGepcO7ozYfeqfwA18pjlyHpVKUS_AAU,2049
+cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=-GhSEsb7iNnZIsYRm3BG9BX4_qUDJMbpj1DsKPY046w,4574
+cloe_nessy/pipeline/actions/transform_union.py,sha256=s81Vge0AbYPc7VkskCYfOQ_LEjqcmfNFyDkytfjcZyo,2720
+cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=j7gRuG3Fedh8JgevIFBbHKock3laJVq4l6Mx3CGU5eo,2676
 cloe_nessy/session/__init__.py,sha256=t7_YjUhJYW3km_FrucaUdbIl1boQtwkyhw_8yE10qzc,74
-cloe_nessy/session/session_manager.py,sha256=
+cloe_nessy/session/session_manager.py,sha256=PK7awMc6fmot7f9FMmvIUbIzKFgjcy2o2bZS9kjVs10,6733
 cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEvzM,101
 cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
 cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
-cloe_nessy-0.3.
-cloe_nessy-0.3.
-cloe_nessy-0.3.
-cloe_nessy-0.3.
+cloe_nessy-0.3.5.dist-info/METADATA,sha256=UUx3aIUgvCLn7j3H4DbCL1k9-47HPKaANiMQsUj66wo,1837
+cloe_nessy-0.3.5.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+cloe_nessy-0.3.5.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
+cloe_nessy-0.3.5.dist-info/RECORD,,
File without changes
|