cloe-nessy 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. cloe_nessy/__init__.py +0 -0
  2. cloe_nessy/clients/__init__.py +5 -0
  3. cloe_nessy/clients/api_client/__init__.py +3 -0
  4. cloe_nessy/clients/api_client/api_client.py +188 -0
  5. cloe_nessy/clients/api_client/api_response.py +72 -0
  6. cloe_nessy/clients/api_client/auth.py +178 -0
  7. cloe_nessy/clients/api_client/exceptions.py +22 -0
  8. cloe_nessy/file_utilities/__init__.py +3 -0
  9. cloe_nessy/file_utilities/exceptions.py +4 -0
  10. cloe_nessy/file_utilities/factory.py +42 -0
  11. cloe_nessy/file_utilities/get_file_paths.py +72 -0
  12. cloe_nessy/file_utilities/location_types.py +29 -0
  13. cloe_nessy/file_utilities/strategies/__init__.py +0 -0
  14. cloe_nessy/file_utilities/strategies/base_strategy.py +59 -0
  15. cloe_nessy/file_utilities/strategies/local_strategy.py +51 -0
  16. cloe_nessy/file_utilities/strategies/onelake_strategy.py +31 -0
  17. cloe_nessy/file_utilities/strategies/utils_strategy.py +72 -0
  18. cloe_nessy/integration/__init__.py +0 -0
  19. cloe_nessy/integration/reader/__init__.py +6 -0
  20. cloe_nessy/integration/reader/api_reader.py +141 -0
  21. cloe_nessy/integration/reader/catalog_reader.py +49 -0
  22. cloe_nessy/integration/reader/excel_reader.py +170 -0
  23. cloe_nessy/integration/reader/exceptions.py +10 -0
  24. cloe_nessy/integration/reader/file_reader.py +96 -0
  25. cloe_nessy/integration/reader/reader.py +34 -0
  26. cloe_nessy/integration/writer/__init__.py +3 -0
  27. cloe_nessy/integration/writer/catalog_writer.py +48 -0
  28. cloe_nessy/logging/__init__.py +3 -0
  29. cloe_nessy/logging/logger_mixin.py +162 -0
  30. cloe_nessy/models/__init__.py +13 -0
  31. cloe_nessy/models/column.py +65 -0
  32. cloe_nessy/models/constraint.py +9 -0
  33. cloe_nessy/models/foreign_key.py +34 -0
  34. cloe_nessy/models/mixins/__init__.py +0 -0
  35. cloe_nessy/models/mixins/read_instance_mixin.py +124 -0
  36. cloe_nessy/models/mixins/template_loader_mixin.py +18 -0
  37. cloe_nessy/models/schema.py +76 -0
  38. cloe_nessy/models/table.py +236 -0
  39. cloe_nessy/models/types.py +7 -0
  40. cloe_nessy/object_manager/__init__.py +3 -0
  41. cloe_nessy/object_manager/table_manager.py +58 -0
  42. cloe_nessy/pipeline/__init__.py +7 -0
  43. cloe_nessy/pipeline/actions/__init__.py +50 -0
  44. cloe_nessy/pipeline/actions/read_api.py +178 -0
  45. cloe_nessy/pipeline/actions/read_catalog_table.py +68 -0
  46. cloe_nessy/pipeline/actions/read_excel.py +177 -0
  47. cloe_nessy/pipeline/actions/read_files.py +105 -0
  48. cloe_nessy/pipeline/actions/read_metadata_yaml.py +66 -0
  49. cloe_nessy/pipeline/actions/transform_change_datatype.py +56 -0
  50. cloe_nessy/pipeline/actions/transform_concat_columns.py +88 -0
  51. cloe_nessy/pipeline/actions/transform_decode.py +102 -0
  52. cloe_nessy/pipeline/actions/transform_distinct.py +40 -0
  53. cloe_nessy/pipeline/actions/transform_filter.py +51 -0
  54. cloe_nessy/pipeline/actions/transform_generic_sql.py +66 -0
  55. cloe_nessy/pipeline/actions/transform_join.py +81 -0
  56. cloe_nessy/pipeline/actions/transform_json_normalize.py +106 -0
  57. cloe_nessy/pipeline/actions/transform_rename_columns.py +60 -0
  58. cloe_nessy/pipeline/actions/transform_replace_values.py +59 -0
  59. cloe_nessy/pipeline/actions/transform_select_columns.py +83 -0
  60. cloe_nessy/pipeline/actions/transform_union.py +71 -0
  61. cloe_nessy/pipeline/actions/write_catalog_table.py +73 -0
  62. cloe_nessy/pipeline/pipeline.py +201 -0
  63. cloe_nessy/pipeline/pipeline_action.py +62 -0
  64. cloe_nessy/pipeline/pipeline_config.py +92 -0
  65. cloe_nessy/pipeline/pipeline_context.py +56 -0
  66. cloe_nessy/pipeline/pipeline_parsing_service.py +156 -0
  67. cloe_nessy/pipeline/pipeline_step.py +50 -0
  68. cloe_nessy/py.typed +0 -0
  69. cloe_nessy/session/__init__.py +3 -0
  70. cloe_nessy/session/session_manager.py +188 -0
  71. cloe_nessy/settings/__init__.py +3 -0
  72. cloe_nessy/settings/settings.py +91 -0
  73. cloe_nessy/utils/__init__.py +0 -0
  74. cloe_nessy/utils/file_and_directory_handler.py +19 -0
  75. cloe_nessy-0.2.9.dist-info/METADATA +26 -0
  76. cloe_nessy-0.2.9.dist-info/RECORD +78 -0
  77. cloe_nessy-0.2.9.dist-info/WHEEL +5 -0
  78. cloe_nessy-0.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,40 @@
+ from typing import Any
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformDistinctAction(PipelineAction):
+     """Selects distinct rows from the DataFrame in the given context.
+
+     Example:
+         ```yaml
+         Distinct Rows:
+             action: TRANSFORM_DISTINCT
+         ```
+     """
+
+     name: str = "TRANSFORM_DISTINCT"
+
+     def run(
+         self,
+         context: PipelineContext,
+         **_: Any,
+     ) -> PipelineContext:
+         """Selects distinct rows from the DataFrame in the given context.
+
+         Args:
+             context: The context in which this Action is executed.
+
+         Raises:
+             ValueError: If the data from the context is None.
+
+         Returns:
+             The context after the execution of this Action, containing the DataFrame with distinct rows.
+         """
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         df = context.data.distinct()
+
+         return context.from_existing(data=df)  # type: ignore
@@ -0,0 +1,51 @@
+ from typing import Any
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformFilterAction(PipelineAction):
+     """Filters the DataFrame in the given context based on a specified condition.
+
+     Example:
+         ```yaml
+         Filter Rows:
+             action: TRANSFORM_FILTER
+             options:
+                 condition: city = "Hamburg"
+         ```
+     """
+
+     name: str = "TRANSFORM_FILTER"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         condition: str = "",
+         **_: Any,
+     ) -> PipelineContext:
+         """Filters the DataFrame in the given context based on a specified condition.
+
+         Args:
+             context: Context in which this Action is executed.
+             condition: A SQL-like expression used to filter the DataFrame.
+
+         Raises:
+             ValueError: If no condition is provided.
+             ValueError: If the data from the context is None.
+
+         Returns:
+             Context after the execution of this Action, containing the filtered DataFrame.
+         """
+         if not condition:
+             raise ValueError("No condition provided.")
+
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         df = context.data
+
+         df_filtered = df.filter(condition=condition)
+
+         return context.from_existing(data=df_filtered)  # type: ignore
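Since the `condition` string is handed straight to `DataFrame.filter`, any Spark SQL boolean expression should work. A minimal standalone PySpark sketch of the underlying operation, outside the pipeline classes (the sample data and column names here are invented for illustration and are not part of the package):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Illustrative data; column names are made up for this sketch.
df = spark.createDataFrame(
    [(1, "Hamburg"), (2, "Berlin")],
    ["id", "city"],
)

# The condition string is passed to DataFrame.filter unchanged,
# so any Spark SQL boolean expression is accepted here.
df_filtered = df.filter("city = 'Hamburg'")
df_filtered.show()
```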
@@ -0,0 +1,66 @@
+ import uuid
+ from typing import Any
+
+ from ...session import SessionManager
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformSqlAction(PipelineAction):
+     """Executes a SQL statement on a DataFrame within the provided context.
+
+     A temporary view is created from the current DataFrame, and the SQL
+     statement is executed on that view. The resulting DataFrame is returned.
+
+     Example:
+         ```yaml
+         SQL Transform:
+             action: TRANSFORM_SQL
+             options:
+                 sql_statement: select city, revenue, firm from {DATA_FRAME} where product="Databricks"
+         ```
+     """
+
+     name: str = "TRANSFORM_SQL"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         sql_statement: str = "",
+         **kwargs: Any,
+     ) -> PipelineContext:
+         """Executes a SQL statement on a DataFrame within the provided context.
+
+         Args:
+             context: Context in which this Action is executed.
+             sql_statement: A string containing the SQL statement to be
+                 executed. The source table should be referred to as "{DATA_FRAME}".
+             **kwargs: Additional keyword arguments are passed as placeholders to the
+                 SQL statement.
+
+         Raises:
+             ValueError: If "{DATA_FRAME}" is not included in the SQL statement.
+             ValueError: If no SQL statement is provided.
+             ValueError: If the data from the context is None.
+
+         Returns:
+             Context after the execution of this Action, containing the DataFrame resulting from the SQL statement.
+         """
+         if not sql_statement:
+             raise ValueError("No SQL statement provided.")
+
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         _spark = SessionManager.get_spark_session()
+
+         temp_view_name = str(uuid.uuid1()).replace("-", "_")
+         context.data.createTempView(temp_view_name)
+
+         if "FROM {DATA_FRAME}".casefold() not in sql_statement.casefold():
+             raise ValueError("Please use 'FROM {DATA_FRAME}' in your SQL statement.")
+
+         df = _spark.sql(sql_statement.format(DATA_FRAME=temp_view_name, **kwargs))
+
+         return context.from_existing(data=df)
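The `{DATA_FRAME}` placeholder mechanism is the interesting part here: the action registers the context DataFrame under a random temporary view name and substitutes that name into the statement with `str.format`. A minimal standalone PySpark sketch of the same mechanism (the SparkSession, sample data, and column names are invented for illustration and not taken from the package):

```python
import uuid

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Invented sample data for illustration only.
df = spark.createDataFrame(
    [("Hamburg", 100, "Acme", "Databricks")],
    ["city", "revenue", "firm", "product"],
)

# Register the DataFrame under a random temporary view name ...
temp_view_name = str(uuid.uuid1()).replace("-", "_")
df.createTempView(temp_view_name)

# ... then substitute that name for the {DATA_FRAME} placeholder.
# Extra keyword arguments would be filled in the same way via str.format.
sql_statement = "select city, revenue, firm from {DATA_FRAME} where product = 'Databricks'"
result = spark.sql(sql_statement.format(DATA_FRAME=temp_view_name))
result.show()
```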
@@ -0,0 +1,81 @@
+ from typing import Any
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+ from ..pipeline_step import PipelineStep
+
+
+ class TransformJoinAction(PipelineAction):
+     """Joins the current DataFrame with another DataFrame defined in joined_data.
+
+     The join operation is performed based on specified columns and the type of join
+     indicated by the `how` parameter.
+
+     Example:
+         ```yaml
+         Join Tables:
+             action: TRANSFORM_JOIN
+             options:
+                 joined_data: ((step:Transform First Table))
+                 join_on: id
+                 how: anti
+         ```
+     """
+
+     name: str = "TRANSFORM_JOIN"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         joined_data: PipelineStep | None = None,
+         join_on: list[str] | str | dict[str, str] | None = None,
+         how: str = "inner",
+         **_: Any,
+     ) -> PipelineContext:
+         """Joins the current DataFrame with another DataFrame defined in joined_data.
+
+         Args:
+             context: Context in which this Action is executed.
+             joined_data: The PipelineStep context defining the DataFrame
+                 to join with as the right side of the join.
+             join_on: A string for the join column
+                 name, a list of column names, or a dictionary mapping columns from the
+                 left DataFrame to the right DataFrame. This defines the condition for the
+                 join operation.
+             how: The type of join to perform. Must be one of: inner, cross, outer,
+                 full, fullouter, left, leftouter, right, rightouter, semi, anti, etc.
+
+         Raises:
+             ValueError: If no joined_data is provided.
+             ValueError: If no join_on is provided.
+             ValueError: If the data from context is None.
+             ValueError: If the data from the joined_data is None.
+
+         Returns:
+             Context after the execution of this Action, containing the result of the join operation.
+         """
+         if joined_data is None or joined_data.result is None or joined_data.result.data is None:
+             raise ValueError("No joined_data provided.")
+         if not join_on:
+             raise ValueError("No join_on provided.")
+
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         df_right = joined_data.result.data.alias("right")  # type: ignore
+         df_left = context.data.alias("left")  # type: ignore
+
+         if isinstance(join_on, str):
+             join_condition = [join_on]
+         elif isinstance(join_on, list):
+             join_condition = join_on
+         else:
+             join_condition = [
+                 df_left[left_column] == df_right[right_column]  # type: ignore
+                 for left_column, right_column in join_on.items()
+             ]
+
+         df = df_left.join(df_right, on=join_condition, how=how)  # type: ignore
+
+         return context.from_existing(data=df)  # type: ignore
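The dictionary form of `join_on` is the least obvious variant: each key/value pair is turned into an equality condition between a left and a right column. A minimal standalone PySpark sketch of that translation (both sample frames and their column names are invented stand-ins for the context and `joined_data` DataFrames):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Invented frames standing in for the left (context) and right (joined_data) sides.
df_left = spark.createDataFrame([(1, "Hamburg")], ["id", "city"]).alias("left")
df_right = spark.createDataFrame([(1, 42.0)], ["customer_id", "revenue"]).alias("right")

# A dict-style join_on such as {"id": "customer_id"} becomes a list of
# equality conditions between left and right columns.
join_on = {"id": "customer_id"}
join_condition = [df_left[left] == df_right[right] for left, right in join_on.items()]

df_joined = df_left.join(df_right, on=join_condition, how="inner")
df_joined.show()
```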
@@ -0,0 +1,106 @@
+ from typing import Any, cast
+
+ import pyspark.sql.functions as F
+
+ from cloe_nessy.pipeline.pipeline_action import PipelineAction
+ from cloe_nessy.pipeline.pipeline_context import PipelineContext
+
+
+ class TransformJsonNormalize(PipelineAction):
+     """Normalizes and flattens the DataFrame by exploding array columns and flattening struct columns.
+
+     The method performs recursive normalization on the DataFrame present in the context,
+     ensuring that the order of columns is retained and new columns created by flattening
+     structs are appended after existing columns.
+
+     Example:
+         ```yaml
+         Normalize Tables:
+             action: TRANSFORM_JSON_NORMALIZE
+             options:
+                 exclude_columns: coordinates
+         ```
+     """
+
+     name: str = "TRANSFORM_JSON_NORMALIZE"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         exclude_columns: list[str] | None = None,
+         **_: Any,
+     ) -> PipelineContext:
+         """Executes the normalization process on the DataFrame present in the context.
+
+         Please note that columns retain their relative order during the
+         normalization process, and new columns created by flattening structs are
+         appended after the existing columns.
+
+         Args:
+             context: The pipeline context that contains the DataFrame to be normalized.
+             exclude_columns: A list of column names to exclude from the normalization process.
+                 These columns will not be exploded or flattened.
+             **_: Additional keyword arguments (not used).
+
+         Returns:
+             A new pipeline context with the normalized DataFrame.
+
+         Raises:
+             ValueError: If the DataFrame in the context is `None`.
+         """
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         if not exclude_columns:
+             exclude_columns = []
+         df = TransformJsonNormalize._normalize(context.data, exclude_columns=cast(list, exclude_columns))
+         return context.from_existing(data=df)
+
+     @staticmethod
+     def _normalize(df, exclude_columns):
+         """Recursively normalizes the given DataFrame by exploding arrays and flattening structs.
+
+         This method performs two primary operations:
+         1. Explodes any array columns, unless they are in the list of excluded columns.
+         2. Flattens any struct columns, renaming nested fields and appending them to the top-level DataFrame.
+
+         The method continues these operations in a loop until there are no array or struct columns left.
+
+         Args:
+             df: The input DataFrame to normalize.
+             exclude_columns: A list of column names to exclude from the normalization process. These columns
+                 will not be exploded or flattened.
+
+         Returns:
+             pyspark.sql.DataFrame: The normalized DataFrame with no array or struct columns.
+         """
+
+         def explode_arrays(df, exclude_columns):
+             array_present = False
+             for col in df.columns:
+                 if df.schema[col].dataType.typeName() == "array" and col not in exclude_columns:
+                     df = df.withColumn(col, F.explode(col))
+                     array_present = True
+             return df, array_present
+
+         def flatten_structs(df):
+             struct_present = False
+             struct_columns = [col for col in df.columns if df.schema[col].dataType.typeName() == "struct"]
+             for col in struct_columns:
+                 df = df.select(F.col("*"), F.col(col + ".*"))
+                 nested_columns = df.select(F.col(col + ".*")).schema.names
+                 for nested_col in nested_columns:
+                     df = df.withColumnRenamed(nested_col, f"{col}_{nested_col}")
+                 df = df.drop(col)
+                 struct_present = True
+             return df, struct_present
+
+         array_present = True
+         struct_present = True
+
+         while array_present or struct_present:
+             df, array_present = explode_arrays(df, exclude_columns)
+             df, struct_present = flatten_structs(df)
+
+         return df
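To make the explode/flatten loop concrete, here is a minimal standalone PySpark sketch of one pass over a single nested record: the array column is exploded into one row per element, and the struct column is flattened into prefixed top-level columns. The record, field names, and prefixing are invented for illustration and simply mimic what `_normalize` does:

```python
import pyspark.sql.functions as F
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()

# Invented nested record: one array column (tags) and one struct column (address).
df = spark.createDataFrame(
    [Row(id=1, tags=["a", "b"], address=Row(city="Hamburg", zip="20095"))]
)

# Step 1: explode the array column (one output row per array element).
df = df.withColumn("tags", F.explode("tags"))

# Step 2: flatten the struct column; nested fields become top-level
# columns prefixed with the parent name, e.g. address_city.
df = df.select(F.col("*"), F.col("address.*"))
for nested_col in ["city", "zip"]:
    df = df.withColumnRenamed(nested_col, f"address_{nested_col}")
df = df.drop("address")

df.show()  # columns: id, tags, address_city, address_zip
```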
@@ -0,0 +1,60 @@
+ from typing import Any
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformRenameColumnsAction(PipelineAction):
+     """Renames the specified columns in the DataFrame.
+
+     This method updates the DataFrame in the provided context by renaming columns according
+     to the mapping defined in the `columns` dictionary, where each key represents an old column
+     name and its corresponding value represents the new column name.
+
+     Example:
+         ```yaml
+         Rename Column:
+             action: TRANSFORM_RENAME_COLUMNS
+             options:
+                 columns:
+                     a_very_long_column_name: shortname
+         ```
+     """
+
+     name: str = "TRANSFORM_RENAME_COLUMNS"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         columns: dict[str, str] | None = None,
+         **_: Any,
+     ) -> PipelineContext:
+         """Renames the specified columns in the DataFrame.
+
+         Args:
+             context: Context in which this Action is executed.
+             columns: A dictionary where the key is the old column name
+                 and the value is the new column name.
+
+         Raises:
+             ValueError: If no columns are provided.
+             ValueError: If the data from context is None.
+
+         Returns:
+             Context after the execution of this Action.
+         """
+         if not columns:
+             raise ValueError("No columns provided.")
+
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         df = context.data
+
+         if isinstance(columns, dict):
+             df = df.withColumnsRenamed(columns)
+         else:
+             raise ValueError("'columns' should be a dict, like {'old_name_1':'new_name_1', 'old_name_2':'new_name_2'}")
+
+         return context.from_existing(data=df)  # type: ignore
@@ -0,0 +1,59 @@
+ from typing import Any
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformReplaceValuesAction(PipelineAction):
+     """Replaces specified values in the given DataFrame.
+
+     This method iterates over the specified `replace` dictionary, where each key is a column name
+     and each value is another dictionary containing old values as keys and new values as the corresponding
+     values. The method updates the DataFrame by replacing occurrences of the old values with the new ones
+     in the specified columns.
+
+     Example:
+         ```yaml
+         Replace Values:
+             action: TRANSFORM_REPLACE_VALUES
+             options:
+                 replace:
+                     empl_function:
+                         sales_employee: seller
+         ```
+     """
+
+     name: str = "TRANSFORM_REPLACE_VALUES"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         replace: dict[str, dict[str, str]] | None = None,
+         **_: Any,
+     ) -> PipelineContext:
+         """Replaces specified values in the given DataFrame.
+
+         Args:
+             context: Context in which this Action is executed.
+             replace: A dictionary where each key is the column name
+                 and the corresponding value is another dictionary mapping old values to new values.
+
+         Raises:
+             ValueError: If no replace values are provided.
+             ValueError: If the data from context is None.
+
+         Returns:
+             Context after the execution of this Action.
+         """
+         if not replace:
+             raise ValueError("No replace values provided.")
+
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         df = context.data
+         for column, to_replace in replace.items():
+             df = df.replace(to_replace=to_replace, subset=[column])  # type: ignore
+
+         return context.from_existing(data=df)  # type: ignore
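Each entry of the `replace` mapping becomes one `DataFrame.replace` call restricted to the matching column via `subset`. A minimal standalone PySpark sketch mirroring the YAML example above (sample rows and names are invented for illustration):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Invented sample data mirroring the YAML example above.
df = spark.createDataFrame(
    [("Alice", "sales_employee"), ("Bob", "engineer")],
    ["name", "empl_function"],
)

# One DataFrame.replace call per column, restricted via subset.
replace = {"empl_function": {"sales_employee": "seller"}}
for column, to_replace in replace.items():
    df = df.replace(to_replace=to_replace, subset=[column])

df.show()  # "sales_employee" has been replaced by "seller"
```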
@@ -0,0 +1,83 @@
+ from typing import Any
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformSelectColumnsAction(PipelineAction):
+     """Selects specified columns from the given DataFrame.
+
+     This method allows you to include or exclude specific columns from the
+     DataFrame. If `include_columns` is provided, only those columns will be
+     selected. If `exclude_columns` is provided, all columns except those will be
+     selected. The method ensures that the specified columns exist in the
+     DataFrame before performing the selection.
+
+     Example:
+         ```yaml
+         Select Columns:
+             action: TRANSFORM_SELECT_COLUMNS
+             options:
+                 include_columns:
+                     - id
+                     - city
+                     - product
+         ```
+     """
+
+     name: str = "TRANSFORM_SELECT_COLUMNS"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         include_columns: list[str] | None = None,
+         exclude_columns: list[str] | None = None,
+         raise_on_non_existing_columns: bool = True,
+         **_: Any,
+     ) -> PipelineContext:
+         """Selects specified columns from the given DataFrame.
+
+         Args:
+             context: Context in which this Action is executed.
+             include_columns: A list of column names that should be included.
+                 If provided, only these columns will be selected.
+             exclude_columns: A list of column names that should be excluded.
+                 If provided, all columns except these will be selected.
+             raise_on_non_existing_columns: If True, raise an error if a specified
+                 column is not found in the DataFrame. If False, ignore the column
+                 and continue with the selection.
+
+         Raises:
+             ValueError: If a specified column is not found in the DataFrame.
+             ValueError: If neither include_columns nor exclude_columns are provided,
+                 or if both are provided.
+
+         Returns:
+             Context after the execution of this Action.
+         """
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         df = context.data
+
+         if (not include_columns and not exclude_columns) or (include_columns and exclude_columns):
+             raise ValueError("Please define either 'include_columns' or 'exclude_columns'.")
+
+         def check_missing_columns(df, columns, raise_on_non_existing_columns):
+             if raise_on_non_existing_columns:
+                 missing_columns = [col for col in columns if col not in df.columns]
+                 if missing_columns:
+                     raise ValueError(f"Columns not found in DataFrame: {missing_columns}")
+
+         try:
+             if include_columns:
+                 check_missing_columns(df, include_columns, raise_on_non_existing_columns)
+                 df_selected = df.select(*include_columns)
+             elif exclude_columns:
+                 check_missing_columns(df, exclude_columns, raise_on_non_existing_columns)
+                 df_selected = df.drop(*exclude_columns)
+         except Exception as e:
+             raise ValueError(f"Column selection error: {e}") from e
+
+         return context.from_existing(data=df_selected)  # type: ignore
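The two mutually exclusive modes map onto `DataFrame.select` and `DataFrame.drop`, with an optional up-front check for missing columns. A minimal standalone PySpark sketch of both paths (sample data, column names, and the guard variable are invented for illustration):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Invented sample data for illustration.
df = spark.createDataFrame(
    [(1, "Hamburg", "Databricks", 100)],
    ["id", "city", "product", "revenue"],
)

include_columns = ["id", "city", "product"]

# Optional guard, equivalent in spirit to raise_on_non_existing_columns=True.
missing = [col for col in include_columns if col not in df.columns]
if missing:
    raise ValueError(f"Columns not found in DataFrame: {missing}")

df_selected = df.select(*include_columns)  # include_columns path
df_reduced = df.drop("revenue")            # exclude_columns path uses drop instead
df_selected.show()
```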
@@ -0,0 +1,71 @@
+ from functools import reduce
+ from typing import Any
+
+ from pyspark.sql.dataframe import DataFrame
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+ from ..pipeline_step import PipelineStep
+
+
+ class TransformUnionAction(PipelineAction):
+     """Unions multiple DataFrames together.
+
+     This method takes the current DataFrame from the context and unites it with
+     additional DataFrames specified in the `union_data` argument. All DataFrames
+     must have the same schema. If any DataFrame in `union_data` is None or
+     empty, a ValueError will be raised.
+
+     Example:
+         ```yaml
+         Union Tables:
+             action: TRANSFORM_UNION
+             options:
+                 union_data:
+                     - ((step: Filter First Table))
+                     - ((step: SQL Transform Second Table))
+         ```
+     """
+
+     name: str = "TRANSFORM_UNION"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         union_data: list[PipelineStep] | None = None,
+         **_: Any,
+     ) -> PipelineContext:
+         """Unions multiple DataFrames together.
+
+         Args:
+             context: Context in which this Action is executed.
+             union_data: A list of PipelineSteps that define the DataFrames
+                 to union with the current context.
+
+         Raises:
+             ValueError: If no union_data is provided.
+             ValueError: If the data from context is None.
+             ValueError: If the data from any of the union_data is None.
+
+         Returns:
+             Context after the execution of this Action.
+         """
+         if not union_data:
+             raise ValueError("No union_data provided.")
+
+         # Check that all union_data contexts have valid data
+         result_contexts = []
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         for ctx in union_data:
+             if ctx.result is None or ctx.result.data is None:
+                 raise ValueError(f"Data from the context of step '{ctx.name}' is required for the operation.")
+             result_contexts.append(ctx.result.data)
+
+         # Union all DataFrames
+         union_dfs = [context.data] + result_contexts
+         df = reduce(DataFrame.unionAll, union_dfs)  # type: ignore
+
+         return context.from_existing(data=df)  # type: ignore
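The union itself is a fold over `DataFrame.unionAll`, which matches columns by position and therefore requires identical schemas, as the docstring notes. A minimal standalone PySpark sketch of that fold (the three sample frames are invented stand-ins for the context DataFrame and the step results):

```python
from functools import reduce

from pyspark.sql import DataFrame, SparkSession

spark = SparkSession.builder.getOrCreate()

# Invented frames with identical schemas, as the action requires.
df_a = spark.createDataFrame([(1, "Hamburg")], ["id", "city"])
df_b = spark.createDataFrame([(2, "Berlin")], ["id", "city"])
df_c = spark.createDataFrame([(3, "Munich")], ["id", "city"])

# The context DataFrame and every step result are folded together
# with DataFrame.unionAll, which matches columns by position.
union_dfs = [df_a, df_b, df_c]
df = reduce(DataFrame.unionAll, union_dfs)
df.show()
```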