cloe-nessy 0.3.3__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/__init__.py +0 -0
- cloe_nessy/clients/__init__.py +0 -0
- cloe_nessy/clients/api_client/__init__.py +0 -0
- cloe_nessy/clients/api_client/api_client.py +0 -0
- cloe_nessy/clients/api_client/api_response.py +0 -0
- cloe_nessy/clients/api_client/auth.py +0 -0
- cloe_nessy/clients/api_client/exceptions.py +0 -0
- cloe_nessy/file_utilities/__init__.py +0 -0
- cloe_nessy/file_utilities/exceptions.py +0 -0
- cloe_nessy/file_utilities/factory.py +0 -0
- cloe_nessy/file_utilities/get_file_paths.py +0 -0
- cloe_nessy/file_utilities/location_types.py +0 -0
- cloe_nessy/file_utilities/strategies/__init__.py +0 -0
- cloe_nessy/file_utilities/strategies/base_strategy.py +0 -0
- cloe_nessy/file_utilities/strategies/local_strategy.py +0 -0
- cloe_nessy/file_utilities/strategies/onelake_strategy.py +0 -0
- cloe_nessy/file_utilities/strategies/utils_strategy.py +0 -0
- cloe_nessy/integration/__init__.py +0 -0
- cloe_nessy/integration/reader/__init__.py +0 -0
- cloe_nessy/integration/reader/api_reader.py +0 -0
- cloe_nessy/integration/reader/catalog_reader.py +0 -0
- cloe_nessy/integration/reader/excel_reader.py +0 -0
- cloe_nessy/integration/reader/exceptions.py +0 -0
- cloe_nessy/integration/reader/file_reader.py +7 -1
- cloe_nessy/integration/reader/reader.py +0 -0
- cloe_nessy/integration/writer/__init__.py +0 -0
- cloe_nessy/integration/writer/catalog_writer.py +1 -1
- cloe_nessy/logging/__init__.py +0 -0
- cloe_nessy/logging/logger_mixin.py +0 -0
- cloe_nessy/models/__init__.py +4 -0
- cloe_nessy/models/adapter/__init__.py +3 -0
- cloe_nessy/models/adapter/unity_catalog_adapter.py +292 -0
- cloe_nessy/models/catalog.py +10 -0
- cloe_nessy/models/column.py +0 -0
- cloe_nessy/models/constraint.py +0 -0
- cloe_nessy/models/foreign_key.py +0 -0
- cloe_nessy/models/mixins/__init__.py +0 -0
- cloe_nessy/models/mixins/read_instance_mixin.py +0 -0
- cloe_nessy/models/mixins/template_loader_mixin.py +0 -0
- cloe_nessy/models/schema.py +19 -0
- cloe_nessy/models/table.py +50 -5
- cloe_nessy/models/types.py +0 -0
- cloe_nessy/models/volume.py +67 -0
- cloe_nessy/object_manager/__init__.py +7 -2
- cloe_nessy/object_manager/table_manager.py +183 -7
- cloe_nessy/object_manager/volume_manager.py +70 -0
- cloe_nessy/pipeline/__init__.py +0 -0
- cloe_nessy/pipeline/actions/__init__.py +2 -0
- cloe_nessy/pipeline/actions/read_api.py +69 -45
- cloe_nessy/pipeline/actions/read_catalog_table.py +9 -9
- cloe_nessy/pipeline/actions/read_excel.py +14 -10
- cloe_nessy/pipeline/actions/read_files.py +54 -28
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +9 -9
- cloe_nessy/pipeline/actions/transform_change_datatype.py +13 -8
- cloe_nessy/pipeline/actions/transform_clean_column_names.py +4 -0
- cloe_nessy/pipeline/actions/transform_concat_columns.py +25 -11
- cloe_nessy/pipeline/actions/transform_decode.py +18 -7
- cloe_nessy/pipeline/actions/transform_deduplication.py +9 -9
- cloe_nessy/pipeline/actions/transform_distinct.py +8 -8
- cloe_nessy/pipeline/actions/transform_filter.py +6 -6
- cloe_nessy/pipeline/actions/transform_generic_sql.py +12 -6
- cloe_nessy/pipeline/actions/transform_group_aggregate.py +20 -26
- cloe_nessy/pipeline/actions/transform_hash_columns.py +209 -0
- cloe_nessy/pipeline/actions/transform_join.py +17 -10
- cloe_nessy/pipeline/actions/transform_json_normalize.py +19 -6
- cloe_nessy/pipeline/actions/transform_rename_columns.py +7 -7
- cloe_nessy/pipeline/actions/transform_replace_values.py +8 -8
- cloe_nessy/pipeline/actions/transform_select_columns.py +38 -9
- cloe_nessy/pipeline/actions/transform_union.py +12 -8
- cloe_nessy/pipeline/actions/write_catalog_table.py +11 -10
- cloe_nessy/pipeline/pipeline.py +44 -2
- cloe_nessy/pipeline/pipeline_action.py +0 -0
- cloe_nessy/pipeline/pipeline_config.py +0 -0
- cloe_nessy/pipeline/pipeline_context.py +0 -0
- cloe_nessy/pipeline/pipeline_parsing_service.py +0 -0
- cloe_nessy/pipeline/pipeline_step.py +0 -0
- cloe_nessy/py.typed +0 -0
- cloe_nessy/session/__init__.py +0 -0
- cloe_nessy/session/session_manager.py +27 -0
- cloe_nessy/settings/__init__.py +0 -0
- cloe_nessy/settings/settings.py +0 -0
- cloe_nessy/utils/__init__.py +0 -0
- cloe_nessy/utils/file_and_directory_handler.py +0 -0
- cloe_nessy-0.3.8.dist-info/METADATA +46 -0
- {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.8.dist-info}/RECORD +41 -35
- {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.8.dist-info}/WHEEL +1 -1
- {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.8.dist-info}/top_level.txt +0 -0
- cloe_nessy-0.3.3.dist-info/METADATA +0 -26

cloe_nessy/pipeline/actions/transform_concat_columns.py
@@ -10,17 +10,31 @@ class TransformConcatColumnsAction(PipelineAction):
     """Concatenates the specified columns in the given DataFrame.

     Example:
-
-
-
-
-
-
-
-
-
-
-
+        === "concat with separator"
+            ```yaml
+            Concat Columns:
+                action: TRANSFORM_CONCAT_COLUMNS
+                options:
+                    name: address
+                    columns:
+                        - street
+                        - postcode
+                        - country
+                    separator: ', '
+            ```
+        === "concat without separator"
+            ```yaml
+            Concat Column:
+                action: TRANSFORM_CONCAT_COLUMNS
+                options:
+                    name: address
+                    columns:
+                        - street
+                        - postcode
+                        - country
+            ```
+        !!! warning "beware of null handling"
+            The `separator` option is not provided, so the default behavior is to use `concat` which returns `NULL` if any of the concatenated values is `NULL`.
     """

     name: str = "TRANSFORM_CONCAT_COLUMNS"
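
The warning above hinges on how Spark's `concat` and `concat_ws` treat `NULL` values. A minimal PySpark sketch of that difference (not the action's implementation; the DataFrame and column names are made up to match the YAML example):

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Main St", None, "DE")],
    "street string, postcode string, country string",
)

# concat: any NULL input makes the whole result NULL
df = df.withColumn("address_concat", F.concat("street", "postcode", "country"))
# concat_ws: NULL inputs are skipped and the rest are joined with the separator
df = df.withColumn("address_concat_ws", F.concat_ws(", ", "street", "postcode", "country"))

df.show(truncate=False)
# address_concat is NULL, address_concat_ws is "Main St, DE"
```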

cloe_nessy/pipeline/actions/transform_decode.py
@@ -11,13 +11,24 @@ class TransformDecodeAction(PipelineAction):
     """Decodes values of a specified column in the DataFrame based on the given format.

     Example:
-
-
-
-
-
-
-
+        === "Decode JSON column"
+            ```yaml
+            Expand JSON:
+                action: "TRANSFORM_DECODE"
+                options:
+                    column: "data"
+                    input_format: "json"
+                    schema: "quality INT, timestamp TIMESTAMP, value DOUBLE"
+            ```
+        === "Decode base64 column"
+            ```yaml
+            Decode base64:
+                action: TRANSFORM_DECODE
+                options:
+                    column: encoded_data
+                    input_format: base64
+                    schema: string
+            ```
     """

     name: str = "TRANSFORM_DECODE"
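
For orientation, the two YAML examples roughly correspond to the following plain PySpark calls. This is a hedged sketch, not the action's source; `df` stands in for the context DataFrame carrying the respective column:

```python
from pyspark.sql import functions as F

# "json": parse the string column "data" against a DDL schema; the raw string becomes a struct
df_json = df.withColumn(
    "data", F.from_json("data", "quality INT, timestamp TIMESTAMP, value DOUBLE")
)

# "base64": decode the "encoded_data" column to binary and cast it back to a string
df_b64 = df.withColumn("encoded_data", F.unbase64("encoded_data").cast("string"))
```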

cloe_nessy/pipeline/actions/transform_deduplication.py
@@ -18,15 +18,15 @@ class TransformDeduplication(PipelineAction):
     (can be changed to lowest by setting the parameter descending to false).

     Example:
-
-
-
-
-
-
-
-
-
+        ```yaml
+        Deduplicate Columns:
+            action: TRANSFORM_DEDUPLICATION
+            options:
+                key_columns:
+                    - id
+                order_by_columns:
+                    - source_file_modification_time
+        ```
     """

     name: str = "TRANSFORM_DEDUPLICATION"
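
The usual way to express this kind of keep-the-latest deduplication in PySpark is a window ranked by the ordering columns. A sketch under the assumption that the action behaves like the example (keeping the highest `source_file_modification_time` per `id`); `df` is a placeholder:

```python
from pyspark.sql import Window, functions as F

# Rank rows within each id, newest modification time first, and keep only the top row.
w = Window.partitionBy("id").orderBy(F.col("source_file_modification_time").desc())
deduplicated = (
    df.withColumn("_row_number", F.row_number().over(w))
    .filter(F.col("_row_number") == 1)
    .drop("_row_number")
)
```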

cloe_nessy/pipeline/actions/transform_distinct.py
@@ -10,14 +10,14 @@ class TransformDistinctAction(PipelineAction):
     If a subset is given these columns are used for duplicate comparison. If no subset is given all columns are used.

     Example:
-
-
-
-
-
-
-
-
+        ```yaml
+        Distinct Columns:
+            action: TRANSFORM_DISTINCT
+            options:
+                subset:
+                    - first_name
+                    - last_name
+        ```
     """

     name: str = "TRANSFORM_DISTINCT"
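
In plain PySpark the documented behavior maps to `dropDuplicates`; a minimal sketch with `df` as a placeholder:

```python
# With a subset, duplicates are judged on these columns only; without one, all columns are compared.
distinct_subset = df.dropDuplicates(["first_name", "last_name"])
distinct_all = df.dropDuplicates()
```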

cloe_nessy/pipeline/actions/transform_filter.py
@@ -8,12 +8,12 @@ class TransformFilterAction(PipelineAction):
     """Filters the DataFrame in the given context based on a specified condition.

     Example:
-
-
-
-
-
-
+        ```yaml
+        Filter Columns:
+            action: TRANSFORM_FILTER
+            options:
+                condition: city="Hamburg"
+        ```
     """

     name: str = "TRANSFORM_FILTER"

cloe_nessy/pipeline/actions/transform_generic_sql.py
@@ -13,12 +13,18 @@ class TransformSqlAction(PipelineAction):
     statement is executed on that view. The resulting DataFrame is returned.

     Example:
-
-
-
-
-
-
+        ```yaml
+        SQL Transform:
+            action: TRANSFORM_SQL
+            options:
+                sql_statement: select city, revenue, firm from {DATA_FRAME} where product="Databricks"
+        ```
+        !!! note
+            The SQL statement should reference the DataFrame as "{DATA_FRAME}".
+            This nessy specific placeholder will be replaced with your input
+            DataFrame from the context. If your pipeline is defined as an
+            f-string, you can escape the curly braces by doubling them, e.g.,
+            "{{DATA_FRAME}}".
     """

     name: str = "TRANSFORM_SQL"
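
One common way to resolve a placeholder like `{DATA_FRAME}` is to register the input DataFrame as a temporary view and substitute the view name before running the statement. A hedged sketch (the action may implement this differently; `spark` and `df` are assumed to exist):

```python
sql_statement = 'select city, revenue, firm from {DATA_FRAME} where product="Databricks"'

# Register the context DataFrame under a temporary name and point the placeholder at it.
df.createOrReplaceTempView("nessy_input")
result = spark.sql(sql_statement.format(DATA_FRAME="nessy_input"))
```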

cloe_nessy/pipeline/actions/transform_group_aggregate.py
@@ -13,33 +13,27 @@ class TransformGroupAggregate(PipelineAction):
     to other columns. The aggregation functions can be specified as a dictionary where keys are column names
     and values are either a single aggregation function or a list of functions.

+    The output DataFrame will contain the grouped columns and the aggregated columns with the aggregation
+    function as a prefix to the column name.
+
     Example:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    Methods:
-        run(context, grouping_columns=None, aggregations=None, **_):
-            Executes the aggregation on the grouped data.
-
-    Raises:
-        ValueError: If the context data is None.
-        ValueError: If no aggregations are provided.
-        ValueError: If invalid aggregation operations are provided.
-        ValueError: If columns with unsupported data types are included in the aggregations.
+        ```yaml
+        Transform Group Aggregate:
+            action: TRANSFORM_GROUP_AGGREGATE
+            options:
+                grouping_columns:
+                    - column1
+                    - column2
+                aggregations:
+                    column3:
+                        - sum
+                        - avg
+                    column4: max
+        ```
+
+        This example groups the DataFrame by `column1` and `column2` and aggregates `column3` by sum and average
+        and `column4` by max. The resulting DataFrame will contain the grouped columns `column1` and `column2`
+        and the aggregated columns `sum_column3`, `avg_column3`, and `max_column4`.
     """

     name: str = "TRANSFORM_GROUP_AGGREGATE"
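
The documented output shape (grouped columns plus `<function>_<column>` aggregates) corresponds to a `groupBy`/`agg` with aliases. A sketch mirroring the YAML example, not the action's code; `df` is a placeholder:

```python
from pyspark.sql import functions as F

aggregated = df.groupBy("column1", "column2").agg(
    F.sum("column3").alias("sum_column3"),
    F.avg("column3").alias("avg_column3"),
    F.max("column4").alias("max_column4"),
)
```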

cloe_nessy/pipeline/actions/transform_hash_columns.py
@@ -0,0 +1,209 @@
+from typing import Any
+
+from pydantic import BaseModel, Field, model_validator
+from pyspark.sql import functions as F
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+SUPPORTED_ALGORITHMS = {"hash", "md5", "sha1", "sha2", "xxhash64", "crc32"}
+VALID_SHA2_BITS = {224, 256, 384, 512}
+
+
+class HashSettings(BaseModel):
+    """Represents the settings for hashing columns.
+
+    Attributes:
+        columns: List of column names to hash.
+        algorithm: Hashing algorithm to use. Must be one of
+            "hash", "md5", "sha1", "sha2", "xxhash64", or "crc32".
+        bits: Bit length for the 'sha2' algorithm. Optional.
+    """
+
+    columns: list[str]
+    algorithm: str = Field(..., description="Hashing algorithm to use")
+    bits: int | None = Field(default=None, description="Only required for sha2")
+
+    @model_validator(mode="before")
+    def validate_all(cls, values):
+        """Validates the input values for a hashing operation before model instantiation.
+
+        This method performs the following checks:
+
+        1. Ensures the specified hashing algorithm is supported.
+        2. Validates that at least one column is provided and that the columns parameter is a non-empty list.
+        3. Checks that hashing multiple columns is only supported for the 'hash' and 'xxhash64' algorithms.
+        4. For the 'sha2' algorithm, ensures that the 'bits' parameter is one of the valid options.
+        5. Ensures that the 'bits' parameter is not provided for algorithms other than 'sha2'.
+
+        Raises:
+            ValueError: If the algorithm is unsupported, no columns are provided, the columns parameter is invalid,
+                or the 'bits' parameter is invalid for the specified algorithm.
+            NotImplementedError: If multiple columns are provided and the algorithm does not support hashing multiple columns.
+
+        Args:
+            cls: The class being validated.
+            values: A dictionary of input values containing 'algorithm', 'columns', and 'bits'.
+
+        Returns:
+            The validated input values.
+        """
+        algorithm = values.get("algorithm")
+        columns = values.get("columns")
+        bits = values.get("bits")
+
+        if algorithm not in SUPPORTED_ALGORITHMS:
+            raise ValueError(
+                f"Unsupported hashing algorithm '{algorithm}'. Supported algorithms are: {SUPPORTED_ALGORITHMS}."
+            )
+
+        if not columns or not isinstance(columns, list) or len(columns) == 0:
+            raise ValueError("At least one column must be provided.")
+
+        if len(columns) > 1 and algorithm not in {"hash", "xxhash64"}:
+            raise NotImplementedError(
+                f"Hashing multiple columns is only supported for 'hash' and 'xxhash64'. Algorithm '{algorithm}' does not support this."
+            )
+
+        if algorithm == "sha2":
+            if bits not in VALID_SHA2_BITS:
+                raise ValueError(f"'bits' must be one of {VALID_SHA2_BITS} when using 'sha2'.")
+        elif bits is not None:
+            raise ValueError("'bits' is only allowed when algorithm is 'sha2'.")
+
+        return values
+
+
+class HashConfig(BaseModel):
+    """A configuration model for defining hash settings for specific columns.
+
+    Attributes:
+        hash_config: A dictionary where the keys are column names
+            (as strings) and the values are `HashSettings` objects that define
+            the hash settings for each column.
+
+    Methods:
+        validate_config: Validates the hash configuration to ensure it contains
+            at least one entry and that all column names are valid strings. Raises a
+            `ValueError` if the configuration is invalid.
+    """
+
+    hash_config: dict[str, HashSettings]
+
+    @model_validator(mode="before")
+    def validate_config(cls, values):
+        """Validates the hash configuration provided in the model.
+
+        This method is executed in "before" mode to ensure that the `hash_config`
+        field in the input values meets the required criteria:
+
+        - It must be a dictionary.
+        - It must contain at least one entry.
+        - Each key in the dictionary must be a non-empty string.
+
+        Raises:
+            ValueError: If `hash_config` is missing, not a dictionary, empty, or
+                contains invalid column names.
+
+        Args:
+            cls: The class to which this validator is applied.
+            values: The input values to validate.
+
+        Returns:
+            The validated input values.
+        """
+        config = values.get("hash_config")
+        if not config or not isinstance(config, dict) or len(config) == 0:
+            raise ValueError("Hash configuration must contain at least one entry.")
+        for new_col in config:
+            if not new_col or not isinstance(new_col, str):
+                raise ValueError(f"Invalid column name '{new_col}' in hash configuration.")
+        return values
+
+
+class TransformHashColumnsAction(PipelineAction):
+    """Hashes specified columns in a DataFrame using a chosen algorithm.
+
+    Given the following `hash_config`:
+
+    Example:
+        ```yaml
+        Hash Columns:
+            action: TRANSFORM_HASH_COLUMNS
+            options:
+                hash_config:
+                    - hashed_column1:
+                        columns: ["column1", "column2"]
+                        algorithm: "sha2"
+                        bits: 224
+                    - hashed_column2:
+                        columns: ["column1"]
+                        algorithm: "crc32"
+        ```
+
+        Given a DataFrame `df` with the following structure:
+
+        | column1 | column2 | column3 |
+        |---------|---------|---------|
+        | foo     | bar     | baz     |
+
+        After running the action, the resulting DataFrame will look like:
+
+        | column1 | column2 | column3 | hashed_column1                                           | hashed_column2 |
+        |---------|---------|---------|----------------------------------------------------------|----------------|
+        | foo     | bar     | baz     | 17725b837e9c896e7123b142eb980131dcc0baa6160db45d4adfdb21 | 1670361220     |
+
+
+    !!! note "Hash values might vary"
+        The actual hash values will depend on the hashing algorithm used and the input data.
+    """
+
+    name: str = "TRANSFORM_HASH_COLUMNS"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        hash_config: HashConfig | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Hashes the specified columns in the DataFrame.
+
+        Args:
+            context: Context in which this Action is executed.
+            hash_config: Dictionary that contains the configuration for executing the hashing.
+
+        Returns:
+            Updated PipelineContext with hashed columns.
+
+        Raises:
+            ValueError: If columns are missing, data is None, or algorithm/bits are invalid.
+            ValueError: If the hash configuration is invalid.
+        """
+        if context.data is None:
+            raise ValueError("Context data is required for hashing.")
+
+        if not hash_config:
+            raise ValueError("Hash configuration is required.")
+
+        df = context.data
+
+        hash_functions = {
+            "hash": lambda cols: F.hash(*[F.col(c) for c in cols]).cast("string"),
+            "xxhash64": lambda cols: F.xxhash64(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
+            "md5": lambda cols: F.md5(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
+            "sha1": lambda cols: F.sha1(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
+            "sha2": lambda cols, bits: F.sha2(F.concat_ws("||", *[F.col(c) for c in cols]), bits).cast("string"),
+            "crc32": lambda cols: F.crc32(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
+        }
+        default_sha2_bits = 256
+
+        config_obj = HashConfig.model_validate({"hash_config": hash_config})
+        for new_col, config in config_obj.hash_config.items():
+            hash_func = hash_functions[config.algorithm]
+            if config.algorithm == "sha2":
+                df = df.withColumn(new_col, hash_func(config.columns, config.bits or default_sha2_bits))  # type: ignore
+            else:
+                df = df.withColumn(new_col, hash_func(config.columns))  # type: ignore
+
+        return context.from_existing(data=df)
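
Based on the `hash_functions` mapping in the new module, the example `hash_config` resolves to roughly these column expressions (shown here as a standalone sketch with a placeholder `df`):

```python
from pyspark.sql import functions as F

# sha2 over the "||"-joined columns with the configured 224 bits
df = df.withColumn(
    "hashed_column1",
    F.sha2(F.concat_ws("||", F.col("column1"), F.col("column2")), 224).cast("string"),
)
# crc32 over the single column, also joined via concat_ws before hashing
df = df.withColumn(
    "hashed_column2",
    F.crc32(F.concat_ws("||", F.col("column1"))).cast("string"),
)
```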

cloe_nessy/pipeline/actions/transform_join.py
@@ -8,18 +8,25 @@ from ..pipeline_step import PipelineStep
 class TransformJoinAction(PipelineAction):
     """Joins the current DataFrame with another DataFrame defined in joined_data.

-    The join operation is performed based on specified columns and the type of
-    indicated by the `how` parameter.
+    The join operation is performed based on specified columns and the type of
+    join indicated by the `how` parameter. Supported join types can be taken
+    from [PySpark
+    documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html)

     Example:
-
-
-
-
-
-
-
-
+        ```yaml
+        Join Tables:
+            action: TRANSFORM_JOIN
+            options:
+                joined_data: ((step:Transform First Table))
+                join_on: id
+                how: anti
+        ```
+
+        !!! note "Referencing a DataFrame from another step"
+            The `joined_data` parameter is a reference to the DataFrame from another step.
+            The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
+            for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
     """

     name: str = "TRANSFORM_JOIN"
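
The YAML example amounts to an anti join on `id`, i.e. it keeps only rows of the current DataFrame without a match in the referenced step's result. A sketch where `other_df` stands in for the DataFrame resolved from `((step:Transform First Table))`:

```python
# "anti" is PySpark's left anti join: unmatched rows from the left side only.
result = df.join(other_df, on="id", how="anti")
```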

cloe_nessy/pipeline/actions/transform_json_normalize.py
@@ -14,12 +14,25 @@ class TransformJsonNormalize(PipelineAction):
     structs are appended after existing columns.

     Example:
-
-
-
-
-
-
+        ```yaml
+        Normalize Tables:
+            action: TRANSFORM_JSON_NORMALIZE
+            options:
+                exclude_columns: coordinates
+        ```
+        Example Input Data:
+
+        | id | name   | coordinates  | attributes                |
+        |----|--------|--------------|---------------------------|
+        | 1  | Alice  | [10.0, 20.0] | {"age": 30, "city": "NY"} |
+        | 2  | Bob    | [30.0, 40.0] | {"age": 25, "city": "LA"} |
+
+        Example Output Data:
+
+        | id | name   | coordinates  | attributes_age | attributes_city |
+        |----|--------|--------------|----------------|-----------------|
+        | 1  | Alice  | [10.0, 20.0] | 30             | NY              |
+        | 2  | Bob    | [30.0, 40.0] | 25             | LA              |
     """

     name: str = "TRANSFORM_JSON_NORMALIZE"
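
The documented input/output pair corresponds to expanding the `attributes` struct into prefixed top-level columns while leaving the excluded `coordinates` column untouched. A hedged sketch of that result in plain PySpark (not necessarily the action's implementation; `df` is a placeholder):

```python
from pyspark.sql import functions as F

flattened = df.select(
    "id",
    "name",
    "coordinates",
    F.col("attributes.age").alias("attributes_age"),
    F.col("attributes.city").alias("attributes_city"),
)
```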

cloe_nessy/pipeline/actions/transform_rename_columns.py
@@ -12,13 +12,13 @@ class TransformRenameColumnsAction(PipelineAction):
     name and its corresponding value represents the new column name.

     Example:
-
-
-
-
-
-
-
+        ```yaml
+        Rename Column:
+            action: TRANSFORM_RENAME_COLUMNS
+            options:
+                columns:
+                    a_very_long_column_name: shortname
+        ```
     """

     name: str = "TRANSFORM_RENAME_COLUMNS"

cloe_nessy/pipeline/actions/transform_replace_values.py
@@ -13,14 +13,14 @@ class TransformReplaceValuesAction(PipelineAction):
     in the specified columns.

     Example:
-
-
-
-
-
-
-
-
+        ```yaml
+        Replace Values:
+            action: TRANSFORM_REPLACE_VALUES
+            options:
+                replace:
+                    empl_function:
+                        sales_employee: seller
+        ```
     """

     name: str = "TRANSFORM_REPLACE_VALUES"
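
The example replacement maps onto PySpark's `DataFrame.replace` restricted to one column; a minimal sketch with `df` as a placeholder (the action itself may be implemented differently):

```python
# Replace "sales_employee" with "seller", but only in the empl_function column.
replaced = df.replace({"sales_employee": "seller"}, subset=["empl_function"])
```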

cloe_nessy/pipeline/actions/transform_select_columns.py
@@ -14,15 +14,44 @@ class TransformSelectColumnsAction(PipelineAction):
     DataFrame before performing the selection.

     Example:
-
-
-
-
-
-
-
-
-
+        Example Input Data:
+
+        | id | name   | coordinates  | attributes                |
+        |----|--------|--------------|---------------------------|
+        | 1  | Alice  | [10.0, 20.0] | {"age": 30, "city": "NY"} |
+        | 2  | Bob    | [30.0, 40.0] | {"age": 25, "city": "LA"} |
+        === "Include Columns"
+            ```yaml
+            Select Columns:
+                action: TRANSFORM_SELECT_COLUMNS
+                options:
+                    include_columns:
+                        - id
+                        - name
+                        - coordinates
+            ```
+            Example Output Data:
+
+            | id | name   | coordinates  |
+            |----|--------|--------------|
+            | 1  | Alice  | [10.0, 20.0] |
+            | 2  | Bob    | [30.0, 40.0] |
+
+        === "Exclude Columns"
+            ```yaml
+            Select Columns:
+                action: TRANSFORM_SELECT_COLUMNS
+                options:
+                    exclude_columns:
+                        - coordinates
+            ```
+            Example Output Data:
+
+            | id | name   | attributes                |
+            |----|--------|---------------------------|
+            | 1  | Alice  | {"age": 30, "city": "NY"} |
+            | 2  | Bob    | {"age": 25, "city": "LA"} |
+
     """

     name: str = "TRANSFORM_SELECT_COLUMNS"

cloe_nessy/pipeline/actions/transform_union.py
@@ -17,14 +17,18 @@ class TransformUnionAction(PipelineAction):
     empty, a ValueError will be raised.

     Example:
-
-
-
-
-
-
-
-
+        ```yaml
+        Union Tables:
+            action: TRANSFORM_UNION
+            options:
+                union_data:
+                    - ((step: Filter First Table))
+                    - ((step: SQL Transform Second Table))
+        ```
+        !!! note "Referencing a DataFrame from another step"
+            The `union_data` parameter is a reference to the DataFrame from another step.
+            The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
+            for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
     """

     name: str = "TRANSFORM_UNION"
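
A sketch of the union the example describes, assuming name-based column alignment; `df_filtered` and `df_sql` stand in for the DataFrames resolved from the two `((step: ...))` references:

```python
# unionByName matches columns by name rather than position.
unioned = df.unionByName(df_filtered).unionByName(df_sql)
```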

cloe_nessy/pipeline/actions/write_catalog_table.py
@@ -9,15 +9,16 @@ class WriteCatalogTableAction(PipelineAction):
     """Writes a DataFrame to a specified catalog table using [CatalogWriter][cloe_nessy.integration.writer.CatalogWriter].

     Example:
-
-
-
-
-
-
-
-
-
+        ```yaml
+        Write Table to Catalog:
+            action: WRITE_CATALOG_TABLE
+            options:
+                table_identifier: my_catalog.business_schema.sales_table
+                mode: append
+                partition_by: day
+                options:
+                    mergeSchema: true
+        ```
     """

     name: str = "WRITE_CATALOG_TABLE"
@@ -42,7 +43,7 @@ class WriteCatalogTableAction(PipelineAction):
             mode: The write mode. One of 'append', 'overwrite', 'error',
                 'errorifexists', or 'ignore'.
             partition_by: Names of the partitioning columns.
-            options:
+            options: PySpark options for the DataFrame.saveAsTable operation (e.g. mergeSchema:true).

         Raises:
             ValueError: If the table name is not specified or cannot be inferred from