cloe-nessy 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/__init__.py +0 -0
- cloe_nessy/clients/__init__.py +5 -0
- cloe_nessy/clients/api_client/__init__.py +3 -0
- cloe_nessy/clients/api_client/api_client.py +188 -0
- cloe_nessy/clients/api_client/api_response.py +72 -0
- cloe_nessy/clients/api_client/auth.py +178 -0
- cloe_nessy/clients/api_client/exceptions.py +22 -0
- cloe_nessy/file_utilities/__init__.py +3 -0
- cloe_nessy/file_utilities/exceptions.py +4 -0
- cloe_nessy/file_utilities/factory.py +42 -0
- cloe_nessy/file_utilities/get_file_paths.py +72 -0
- cloe_nessy/file_utilities/location_types.py +29 -0
- cloe_nessy/file_utilities/strategies/__init__.py +0 -0
- cloe_nessy/file_utilities/strategies/base_strategy.py +59 -0
- cloe_nessy/file_utilities/strategies/local_strategy.py +51 -0
- cloe_nessy/file_utilities/strategies/onelake_strategy.py +31 -0
- cloe_nessy/file_utilities/strategies/utils_strategy.py +72 -0
- cloe_nessy/integration/__init__.py +0 -0
- cloe_nessy/integration/reader/__init__.py +6 -0
- cloe_nessy/integration/reader/api_reader.py +141 -0
- cloe_nessy/integration/reader/catalog_reader.py +49 -0
- cloe_nessy/integration/reader/excel_reader.py +170 -0
- cloe_nessy/integration/reader/exceptions.py +10 -0
- cloe_nessy/integration/reader/file_reader.py +96 -0
- cloe_nessy/integration/reader/reader.py +34 -0
- cloe_nessy/integration/writer/__init__.py +3 -0
- cloe_nessy/integration/writer/catalog_writer.py +48 -0
- cloe_nessy/logging/__init__.py +3 -0
- cloe_nessy/logging/logger_mixin.py +162 -0
- cloe_nessy/models/__init__.py +13 -0
- cloe_nessy/models/column.py +65 -0
- cloe_nessy/models/constraint.py +9 -0
- cloe_nessy/models/foreign_key.py +34 -0
- cloe_nessy/models/mixins/__init__.py +0 -0
- cloe_nessy/models/mixins/read_instance_mixin.py +124 -0
- cloe_nessy/models/mixins/template_loader_mixin.py +18 -0
- cloe_nessy/models/schema.py +76 -0
- cloe_nessy/models/table.py +236 -0
- cloe_nessy/models/types.py +7 -0
- cloe_nessy/object_manager/__init__.py +3 -0
- cloe_nessy/object_manager/table_manager.py +58 -0
- cloe_nessy/pipeline/__init__.py +7 -0
- cloe_nessy/pipeline/actions/__init__.py +50 -0
- cloe_nessy/pipeline/actions/read_api.py +178 -0
- cloe_nessy/pipeline/actions/read_catalog_table.py +68 -0
- cloe_nessy/pipeline/actions/read_excel.py +177 -0
- cloe_nessy/pipeline/actions/read_files.py +105 -0
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +66 -0
- cloe_nessy/pipeline/actions/transform_change_datatype.py +56 -0
- cloe_nessy/pipeline/actions/transform_concat_columns.py +88 -0
- cloe_nessy/pipeline/actions/transform_decode.py +102 -0
- cloe_nessy/pipeline/actions/transform_distinct.py +40 -0
- cloe_nessy/pipeline/actions/transform_filter.py +51 -0
- cloe_nessy/pipeline/actions/transform_generic_sql.py +66 -0
- cloe_nessy/pipeline/actions/transform_join.py +81 -0
- cloe_nessy/pipeline/actions/transform_json_normalize.py +106 -0
- cloe_nessy/pipeline/actions/transform_rename_columns.py +60 -0
- cloe_nessy/pipeline/actions/transform_replace_values.py +59 -0
- cloe_nessy/pipeline/actions/transform_select_columns.py +83 -0
- cloe_nessy/pipeline/actions/transform_union.py +71 -0
- cloe_nessy/pipeline/actions/write_catalog_table.py +73 -0
- cloe_nessy/pipeline/pipeline.py +201 -0
- cloe_nessy/pipeline/pipeline_action.py +62 -0
- cloe_nessy/pipeline/pipeline_config.py +92 -0
- cloe_nessy/pipeline/pipeline_context.py +56 -0
- cloe_nessy/pipeline/pipeline_parsing_service.py +156 -0
- cloe_nessy/pipeline/pipeline_step.py +50 -0
- cloe_nessy/py.typed +0 -0
- cloe_nessy/session/__init__.py +3 -0
- cloe_nessy/session/session_manager.py +188 -0
- cloe_nessy/settings/__init__.py +3 -0
- cloe_nessy/settings/settings.py +91 -0
- cloe_nessy/utils/__init__.py +0 -0
- cloe_nessy/utils/file_and_directory_handler.py +19 -0
- cloe_nessy-0.2.9.dist-info/METADATA +26 -0
- cloe_nessy-0.2.9.dist-info/RECORD +78 -0
- cloe_nessy-0.2.9.dist-info/WHEEL +5 -0
- cloe_nessy-0.2.9.dist-info/top_level.txt +1 -0
cloe_nessy/pipeline/actions/transform_distinct.py
@@ -0,0 +1,40 @@
+from typing import Any
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformDistinctAction(PipelineAction):
+    """Selects distinct rows from the DataFrame in the given context.
+
+    Example:
+        ```yaml
+        Select Distinct Rows:
+            action: TRANSFORM_DISTINCT
+        ```
+    """
+
+    name: str = "TRANSFORM_DISTINCT"
+
+    def run(
+        self,
+        context: PipelineContext,
+        **_: Any,
+    ) -> PipelineContext:
+        """Selects distinct rows from the DataFrame in the given context.
+
+        Args:
+            context: The context in which this Action is executed.
+
+        Raises:
+            ValueError: If the data from the context is None.
+
+        Returns:
+            The context after the execution of this Action, containing the DataFrame with distinct rows.
+        """
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        df = context.data.distinct()
+
+        return context.from_existing(data=df)  # type: ignore
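For orientation, a minimal standalone PySpark sketch of the operation this action wraps; the session setup and sample data are illustrative assumptions, not part of the package:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "Hamburg"), (1, "Hamburg"), (2, "Berlin")], ["id", "city"])

# TRANSFORM_DISTINCT reduces to a single DataFrame call on the context data.
df.distinct().show()  # the duplicate (1, "Hamburg") row collapses to one
```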
cloe_nessy/pipeline/actions/transform_filter.py
@@ -0,0 +1,51 @@
+from typing import Any
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformFilterAction(PipelineAction):
+    """Filters the DataFrame in the given context based on a specified condition.
+
+    Example:
+        ```yaml
+        Filter Rows:
+            action: TRANSFORM_FILTER
+            options:
+                condition: city = "Hamburg"
+        ```
+    """
+
+    name: str = "TRANSFORM_FILTER"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        condition: str = "",
+        **_: Any,
+    ) -> PipelineContext:
+        """Filters the DataFrame in the given context based on a specified condition.
+
+        Args:
+            context: Context in which this Action is executed.
+            condition: A SQL-like expression used to filter the DataFrame.
+
+        Raises:
+            ValueError: If no condition is provided.
+            ValueError: If the data from the context is None.
+
+        Returns:
+            Context after the execution of this Action, containing the filtered DataFrame.
+        """
+        if not condition:
+            raise ValueError("No condition provided.")
+
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        df = context.data
+
+        df_filtered = df.filter(condition=condition)
+
+        return context.from_existing(data=df_filtered)  # type: ignore
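A short standalone sketch of the wrapped call, with assumed sample data. Since `condition` is handed straight to `DataFrame.filter`, it must be a bare SQL expression without a leading `WHERE` keyword:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Hamburg",), ("Berlin",)], ["city"])

# Equivalent to the action body: the condition string is a SQL expression.
df_filtered = df.filter('city = "Hamburg"')
```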
cloe_nessy/pipeline/actions/transform_generic_sql.py
@@ -0,0 +1,66 @@
+import uuid
+from typing import Any
+
+from ...session import SessionManager
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformSqlAction(PipelineAction):
+    """Executes a SQL statement on a DataFrame within the provided context.
+
+    A temporary view is created from the current DataFrame, and the SQL
+    statement is executed on that view. The resulting DataFrame is returned.
+
+    Example:
+        ```yaml
+        SQL Transform:
+            action: TRANSFORM_SQL
+            options:
+                sql_statement: select city, revenue, firm from {DATA_FRAME} where product="Databricks"
+        ```
+    """
+
+    name: str = "TRANSFORM_SQL"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        sql_statement: str = "",
+        **kwargs: Any,
+    ) -> PipelineContext:
+        """Executes a SQL statement on a DataFrame within the provided context.
+
+        Args:
+            context: Context in which this Action is executed.
+            sql_statement: A string containing the SQL statement to be
+                executed. The source table should be referred to as "{DATA_FRAME}".
+            **kwargs: Additional keyword arguments are passed as placeholders to the
+                SQL statement.
+
+        Raises:
+            ValueError: If "{DATA_FRAME}" is not included in the SQL statement.
+            ValueError: If no SQL statement is provided.
+            ValueError: If the data from the context is None.
+
+        Returns:
+            Context after the execution of this Action, containing the DataFrame resulting from the SQL statement.
+        """
+        if not sql_statement:
+            raise ValueError("No SQL statement provided.")
+
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        _spark = SessionManager.get_spark_session()
+
+        temp_view_name = str(uuid.uuid1()).replace("-", "_")
+        context.data.createTempView(temp_view_name)
+
+        if "FROM {DATA_FRAME}".casefold() not in sql_statement.casefold():
+            raise ValueError("Please use 'FROM {DATA_FRAME}' in your SQL statement.")
+
+        df = _spark.sql(sql_statement.format(DATA_FRAME=temp_view_name, **kwargs))
+
+        return context.from_existing(data=df)
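The placeholder mechanics can be reproduced with plain PySpark. In this sketch the view name is fixed and the data is assumed for illustration (the action itself derives a unique view name from `uuid.uuid1()`):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Hamburg", 10, "ACME", "Databricks")], ["city", "revenue", "firm", "product"]
)

# Register a throwaway view, then substitute its name for {DATA_FRAME}
# via str.format before running the query.
df.createTempView("src_view")
sql_statement = 'select city, revenue, firm from {DATA_FRAME} where product = "Databricks"'
result = spark.sql(sql_statement.format(DATA_FRAME="src_view"))
```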
cloe_nessy/pipeline/actions/transform_join.py
@@ -0,0 +1,81 @@
+from typing import Any
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+from ..pipeline_step import PipelineStep
+
+
+class TransformJoinAction(PipelineAction):
+    """Joins the current DataFrame with another DataFrame defined in joined_data.
+
+    The join operation is performed based on specified columns and the type of join
+    indicated by the `how` parameter.
+
+    Example:
+        ```yaml
+        Join Tables:
+            action: TRANSFORM_JOIN
+            options:
+                joined_data: ((step:Transform First Table))
+                join_on: id
+                how: anti
+        ```
+    """
+
+    name: str = "TRANSFORM_JOIN"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        joined_data: PipelineStep | None = None,
+        join_on: list[str] | str | dict[str, str] | None = None,
+        how: str = "inner",
+        **_: Any,
+    ) -> PipelineContext:
+        """Joins the current DataFrame with another DataFrame defined in joined_data.
+
+        Args:
+            context: Context in which this Action is executed.
+            joined_data: The PipelineStep context defining the DataFrame
+                to join with as the right side of the join.
+            join_on: A string for the join column
+                name, a list of column names, or a dictionary mapping columns from the
+                left DataFrame to the right DataFrame. This defines the condition for the
+                join operation.
+            how: The type of join to perform, e.g. inner, cross, outer, full,
+                fullouter, left, leftouter, right, rightouter, semi, or anti.
+
+        Raises:
+            ValueError: If no joined_data is provided.
+            ValueError: If no join_on is provided.
+            ValueError: If the data from context is None.
+            ValueError: If the data from the joined_data is None.
+
+        Returns:
+            Context after the execution of this Action, containing the result of the join operation.
+        """
+        if joined_data is None or joined_data.result is None or joined_data.result.data is None:
+            raise ValueError("No joined_data provided.")
+        if not join_on:
+            raise ValueError("No join_on provided.")
+
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        df_right = joined_data.result.data.alias("right")  # type: ignore
+        df_left = context.data.alias("left")  # type: ignore
+
+        if isinstance(join_on, str):
+            join_condition = [join_on]
+        elif isinstance(join_on, list):
+            join_condition = join_on
+        else:
+            join_condition = [
+                df_left[left_column] == df_right[right_column]  # type: ignore
+                for left_column, right_column in join_on.items()
+            ]
+
+        df = df_left.join(df_right, on=join_condition, how=how)  # type: ignore
+
+        return context.from_existing(data=df)  # type: ignore
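The dict form of `join_on` is the least obvious case; a short PySpark sketch of the condition it expands to, with tables and column names assumed for illustration:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_left = spark.createDataFrame([(1, "a")], ["id", "val"]).alias("left")
df_right = spark.createDataFrame([(1, "b")], ["key", "other"]).alias("right")

# join_on={"id": "key"} expands to a list of column-equality conditions,
# mirroring the comprehension in run().
join_condition = [df_left["id"] == df_right["key"]]
df = df_left.join(df_right, on=join_condition, how="inner")
```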
cloe_nessy/pipeline/actions/transform_json_normalize.py
@@ -0,0 +1,106 @@
+from typing import Any, cast
+
+import pyspark.sql.functions as F
+
+from cloe_nessy.pipeline.pipeline_action import PipelineAction
+from cloe_nessy.pipeline.pipeline_context import PipelineContext
+
+
+class TransformJsonNormalize(PipelineAction):
+    """Normalizes and flattens the DataFrame by exploding array columns and flattening struct columns.
+
+    The method performs recursive normalization on the DataFrame present in the context,
+    ensuring that the order of columns is retained and new columns created by flattening
+    structs are appended after existing columns.
+
+    Example:
+        ```yaml
+        Normalize Tables:
+            action: TRANSFORM_JSON_NORMALIZE
+            options:
+                exclude_columns: coordinates
+        ```
+    """
+
+    name: str = "TRANSFORM_JSON_NORMALIZE"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        exclude_columns: list[str] | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Executes the normalization process on the DataFrame present in the context.
+
+        Please note that columns retain their relative order during the
+        normalization process, and new columns created by flattening structs are
+        appended after the existing columns.
+
+        Args:
+            context: The pipeline context that contains the DataFrame to be normalized.
+            exclude_columns: A list of column names to exclude from the normalization process.
+                These columns will not be exploded or flattened.
+            **_: Additional keyword arguments (not used).
+
+        Returns:
+            A new pipeline context with the normalized DataFrame.
+
+        Raises:
+            ValueError: If the DataFrame in the context is `None`.
+        """
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        if not exclude_columns:
+            exclude_columns = []
+        df = TransformJsonNormalize._normalize(context.data, exclude_columns=cast(list, exclude_columns))
+        return context.from_existing(data=df)
+
+    @staticmethod
+    def _normalize(df, exclude_columns):
+        """Recursively normalizes the given DataFrame by exploding arrays and flattening structs.
+
+        This method performs two primary operations:
+        1. Explodes any array columns, unless they are in the list of excluded columns.
+        2. Flattens any struct columns, renaming nested fields and appending them to the top-level DataFrame.
+
+        The method continues these operations in a loop until there are no array or struct columns left.
+
+        Args:
+            df: The input DataFrame to normalize.
+            exclude_columns: A list of column names to exclude from the normalization process. These columns
+                will not be exploded or flattened.
+
+        Returns:
+            pyspark.sql.DataFrame: The normalized DataFrame with no array or struct columns.
+        """
+
+        def explode_arrays(df, exclude_columns):
+            array_present = False
+            for col in df.columns:
+                if df.schema[col].dataType.typeName() == "array" and col not in exclude_columns:
+                    df = df.withColumn(col, F.explode(col))
+                    array_present = True
+            return df, array_present
+
+        def flatten_structs(df):
+            struct_present = False
+            struct_columns = [col for col in df.columns if df.schema[col].dataType.typeName() == "struct"]
+            for col in struct_columns:
+                df = df.select(F.col("*"), F.col(col + ".*"))
+                nested_columns = df.select(F.col(col + ".*")).schema.names
+                for nested_col in nested_columns:
+                    df = df.withColumnRenamed(nested_col, f"{col}_{nested_col}")
+                df = df.drop(col)
+                struct_present = True
+            return df, struct_present
+
+        array_present = True
+        struct_present = True
+
+        while array_present or struct_present:
+            df, array_present = explode_arrays(df, exclude_columns)
+            df, struct_present = flatten_structs(df)
+
+        return df
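A sketch of one pass of the normalization loop on a hand-built nested DataFrame; the schema and data are assumptions for illustration:

```python
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, [{"x": 1, "y": 2}])],
    "id INT, points ARRAY<STRUCT<x: INT, y: INT>>",
)

# Explode the array, then flatten the struct with prefixed field names,
# as explode_arrays and flatten_structs do; result columns: id, points_x, points_y.
df = df.withColumn("points", F.explode("points"))
df = df.select(F.col("*"), F.col("points.*"))
for nested_col in ["x", "y"]:
    df = df.withColumnRenamed(nested_col, f"points_{nested_col}")
df = df.drop("points")
```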
cloe_nessy/pipeline/actions/transform_rename_columns.py
@@ -0,0 +1,60 @@
+from typing import Any
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformRenameColumnsAction(PipelineAction):
+    """Renames the specified columns in the DataFrame.
+
+    This method updates the DataFrame in the provided context by renaming columns according
+    to the mapping defined in the `columns` dictionary, where each key represents an old column
+    name and its corresponding value represents the new column name.
+
+    Example:
+        ```yaml
+        Rename Column:
+            action: TRANSFORM_RENAME_COLUMNS
+            options:
+                columns:
+                    a_very_long_column_name: shortname
+        ```
+    """
+
+    name: str = "TRANSFORM_RENAME_COLUMNS"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        columns: dict[str, str] | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Renames the specified columns in the DataFrame.
+
+        Args:
+            context: Context in which this Action is executed.
+            columns: A dictionary where the key is the old column name
+                and the value is the new column name.
+
+        Raises:
+            ValueError: If no columns are provided.
+            ValueError: If the data from context is None.
+
+        Returns:
+            Context after the execution of this Action.
+        """
+        if not columns:
+            raise ValueError("No columns provided.")
+
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        df = context.data
+
+        if isinstance(columns, dict):
+            df = df.withColumnsRenamed(columns)
+        else:
+            raise ValueError("'columns' should be a dict, like {'old_name_1':'new_name_1', 'old_name_2':'new_name_2'}")
+
+        return context.from_existing(data=df)  # type: ignore
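A minimal standalone sketch of the wrapped call, with assumed sample data; note that `DataFrame.withColumnsRenamed` requires PySpark 3.4 or later:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,)], ["a_very_long_column_name"])

# withColumnsRenamed (PySpark 3.4+) applies the whole old-name -> new-name
# mapping in a single call.
df = df.withColumnsRenamed({"a_very_long_column_name": "shortname"})
```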
cloe_nessy/pipeline/actions/transform_replace_values.py
@@ -0,0 +1,59 @@
+from typing import Any
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformReplaceValuesAction(PipelineAction):
+    """Replaces specified values in the given DataFrame.
+
+    This method iterates over the specified `replace` dictionary, where each key is a column name
+    and each value is another dictionary containing old values as keys and new values as the corresponding
+    values. The method updates the DataFrame by replacing occurrences of the old values with the new ones
+    in the specified columns.
+
+    Example:
+        ```yaml
+        Replace Values:
+            action: TRANSFORM_REPLACE_VALUES
+            options:
+                replace:
+                    empl_function:
+                        sales_employee: seller
+        ```
+    """
+
+    name: str = "TRANSFORM_REPLACE_VALUES"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        replace: dict[str, dict[str, str]] | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Replaces specified values in the given DataFrame.
+
+        Args:
+            context: Context in which this Action is executed.
+            replace: A dictionary where each key is the column name
+                and the corresponding value is another dictionary mapping old values to new values.
+
+        Raises:
+            ValueError: If no replace values are provided.
+            ValueError: If the data from context is None.
+
+        Returns:
+            Context after the execution of this Action.
+        """
+        if not replace:
+            raise ValueError("No replace values provided.")
+
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        df = context.data
+        for column, to_replace in replace.items():
+            df = df.replace(to_replace=to_replace, subset=[column])  # type: ignore
+
+        return context.from_existing(data=df)  # type: ignore
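A short standalone sketch of the per-column replacement, with assumed sample data mirroring the YAML example above:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("sales_employee",), ("manager",)], ["empl_function"])

# Each outer key selects a column; the inner dict maps old values to new ones.
df = df.replace(to_replace={"sales_employee": "seller"}, subset=["empl_function"])
```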
cloe_nessy/pipeline/actions/transform_select_columns.py
@@ -0,0 +1,83 @@
+from typing import Any
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformSelectColumnsAction(PipelineAction):
+    """Selects specified columns from the given DataFrame.
+
+    This method allows you to include or exclude specific columns from the
+    DataFrame. If `include_columns` is provided, only those columns will be
+    selected. If `exclude_columns` is provided, all columns except those will be
+    selected. The method ensures that the specified columns exist in the
+    DataFrame before performing the selection.
+
+    Example:
+        ```yaml
+        Select Columns:
+            action: TRANSFORM_SELECT_COLUMNS
+            options:
+                include_columns:
+                    - id
+                    - city
+                    - product
+        ```
+    """
+
+    name: str = "TRANSFORM_SELECT_COLUMNS"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        include_columns: list[str] | None = None,
+        exclude_columns: list[str] | None = None,
+        raise_on_non_existing_columns: bool = True,
+        **_: Any,
+    ) -> PipelineContext:
+        """Selects specified columns from the given DataFrame.
+
+        Args:
+            context: Context in which this Action is executed.
+            include_columns: A list of column names that should be included.
+                If provided, only these columns will be selected.
+            exclude_columns: A list of column names that should be excluded.
+                If provided, all columns except these will be selected.
+            raise_on_non_existing_columns: If True, raise an error if a specified
+                column is not found in the DataFrame. If False, ignore the column
+                and continue with the selection.
+
+        Raises:
+            ValueError: If a specified column is not found in the DataFrame.
+            ValueError: If neither include_columns nor exclude_columns are provided,
+                or if both are provided.
+
+        Returns:
+            Context after the execution of this Action.
+        """
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        df = context.data
+
+        if (not include_columns and not exclude_columns) or (include_columns and exclude_columns):
+            raise ValueError("Please define either 'include_columns' or 'exclude_columns'.")
+
+        def check_missing_columns(df, columns, raise_on_non_existing_columns):
+            if raise_on_non_existing_columns:
+                missing_columns = [col for col in columns if col not in df.columns]
+                if missing_columns:
+                    raise ValueError(f"Columns not found in DataFrame: {missing_columns}")
+
+        try:
+            if include_columns:
+                check_missing_columns(df, include_columns, raise_on_non_existing_columns)
+                df_selected = df.select(*include_columns)
+            elif exclude_columns:
+                check_missing_columns(df, exclude_columns, raise_on_non_existing_columns)
+                df_selected = df.drop(*exclude_columns)
+        except Exception as e:
+            raise ValueError(f"Column selection error: {e}") from e
+
+        return context.from_existing(data=df_selected)  # type: ignore
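For completeness, the two selection modes in plain PySpark, with assumed sample data:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "Hamburg", "Databricks", 0.1)], ["id", "city", "product", "noise"])

# include_columns maps to select, exclude_columns to drop.
df_included = df.select("id", "city", "product")
df_excluded = df.drop("noise")
```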
cloe_nessy/pipeline/actions/transform_union.py
@@ -0,0 +1,71 @@
+from functools import reduce
+from typing import Any
+
+from pyspark.sql.dataframe import DataFrame
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+from ..pipeline_step import PipelineStep
+
+
+class TransformUnionAction(PipelineAction):
+    """Unions multiple DataFrames together.
+
+    This method takes the current DataFrame from the context and unites it with
+    additional DataFrames specified in the `union_data` argument. All DataFrames
+    must have the same schema. If any DataFrame in `union_data` is None or
+    empty, a ValueError will be raised.
+
+    Example:
+        ```yaml
+        Union Tables:
+            action: TRANSFORM_UNION
+            options:
+                union_data:
+                    - ((step: Filter First Table))
+                    - ((step: SQL Transform Second Table))
+        ```
+    """
+
+    name: str = "TRANSFORM_UNION"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        union_data: list[PipelineStep] | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Unions multiple DataFrames together.
+
+        Args:
+            context: Context in which this Action is executed.
+            union_data: A list of PipelineSteps that define the DataFrames
+                to union with the current context.
+
+        Raises:
+            ValueError: If no union_data is provided.
+            ValueError: If the data from context is None.
+            ValueError: If the data from any of the union_data is None.
+
+        Returns:
+            Context after the execution of this Action.
+        """
+        if not union_data:
+            raise ValueError("No union_data provided.")
+
+        # Check that all union_data contexts have valid data
+        result_contexts = []
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        for ctx in union_data:
+            if ctx.result is None or ctx.result.data is None:
+                raise ValueError(f"Data from the context of step '{ctx.name}' is required for the operation.")
+            result_contexts.append(ctx.result.data)
+
+        # Union all DataFrames
+        union_dfs = [context.data] + result_contexts
+        df = reduce(DataFrame.unionAll, union_dfs)  # type: ignore
+
+        return context.from_existing(data=df)  # type: ignore
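A minimal standalone sketch of the fold over `unionAll`, with assumed sample DataFrames standing in for the context data and step results:

```python
from functools import reduce

from pyspark.sql import DataFrame, SparkSession

spark = SparkSession.builder.getOrCreate()
df1 = spark.createDataFrame([(1,)], ["id"])
df2 = spark.createDataFrame([(2,)], ["id"])

# The action folds the context DataFrame and every step result with unionAll,
# which matches columns by position, so all schemas must align.
df = reduce(DataFrame.unionAll, [df1, df2])
```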