cloe-nessy 0.3.3__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. cloe_nessy/__init__.py +0 -0
  2. cloe_nessy/clients/__init__.py +0 -0
  3. cloe_nessy/clients/api_client/__init__.py +0 -0
  4. cloe_nessy/clients/api_client/api_client.py +0 -0
  5. cloe_nessy/clients/api_client/api_response.py +0 -0
  6. cloe_nessy/clients/api_client/auth.py +0 -0
  7. cloe_nessy/clients/api_client/exceptions.py +0 -0
  8. cloe_nessy/file_utilities/__init__.py +0 -0
  9. cloe_nessy/file_utilities/exceptions.py +0 -0
  10. cloe_nessy/file_utilities/factory.py +0 -0
  11. cloe_nessy/file_utilities/get_file_paths.py +0 -0
  12. cloe_nessy/file_utilities/location_types.py +0 -0
  13. cloe_nessy/file_utilities/strategies/__init__.py +0 -0
  14. cloe_nessy/file_utilities/strategies/base_strategy.py +0 -0
  15. cloe_nessy/file_utilities/strategies/local_strategy.py +0 -0
  16. cloe_nessy/file_utilities/strategies/onelake_strategy.py +0 -0
  17. cloe_nessy/file_utilities/strategies/utils_strategy.py +0 -0
  18. cloe_nessy/integration/__init__.py +0 -0
  19. cloe_nessy/integration/reader/__init__.py +0 -0
  20. cloe_nessy/integration/reader/api_reader.py +0 -0
  21. cloe_nessy/integration/reader/catalog_reader.py +0 -0
  22. cloe_nessy/integration/reader/excel_reader.py +0 -0
  23. cloe_nessy/integration/reader/exceptions.py +0 -0
  24. cloe_nessy/integration/reader/file_reader.py +7 -1
  25. cloe_nessy/integration/reader/reader.py +0 -0
  26. cloe_nessy/integration/writer/__init__.py +0 -0
  27. cloe_nessy/integration/writer/catalog_writer.py +1 -1
  28. cloe_nessy/logging/__init__.py +0 -0
  29. cloe_nessy/logging/logger_mixin.py +0 -0
  30. cloe_nessy/models/__init__.py +4 -0
  31. cloe_nessy/models/adapter/__init__.py +3 -0
  32. cloe_nessy/models/adapter/unity_catalog_adapter.py +292 -0
  33. cloe_nessy/models/catalog.py +10 -0
  34. cloe_nessy/models/column.py +0 -0
  35. cloe_nessy/models/constraint.py +0 -0
  36. cloe_nessy/models/foreign_key.py +0 -0
  37. cloe_nessy/models/mixins/__init__.py +0 -0
  38. cloe_nessy/models/mixins/read_instance_mixin.py +0 -0
  39. cloe_nessy/models/mixins/template_loader_mixin.py +0 -0
  40. cloe_nessy/models/schema.py +19 -0
  41. cloe_nessy/models/table.py +50 -5
  42. cloe_nessy/models/types.py +0 -0
  43. cloe_nessy/models/volume.py +67 -0
  44. cloe_nessy/object_manager/__init__.py +7 -2
  45. cloe_nessy/object_manager/table_manager.py +183 -7
  46. cloe_nessy/object_manager/volume_manager.py +70 -0
  47. cloe_nessy/pipeline/__init__.py +0 -0
  48. cloe_nessy/pipeline/actions/__init__.py +2 -0
  49. cloe_nessy/pipeline/actions/read_api.py +69 -45
  50. cloe_nessy/pipeline/actions/read_catalog_table.py +9 -9
  51. cloe_nessy/pipeline/actions/read_excel.py +14 -10
  52. cloe_nessy/pipeline/actions/read_files.py +54 -28
  53. cloe_nessy/pipeline/actions/read_metadata_yaml.py +9 -9
  54. cloe_nessy/pipeline/actions/transform_change_datatype.py +13 -8
  55. cloe_nessy/pipeline/actions/transform_clean_column_names.py +4 -0
  56. cloe_nessy/pipeline/actions/transform_concat_columns.py +25 -11
  57. cloe_nessy/pipeline/actions/transform_decode.py +18 -7
  58. cloe_nessy/pipeline/actions/transform_deduplication.py +9 -9
  59. cloe_nessy/pipeline/actions/transform_distinct.py +8 -8
  60. cloe_nessy/pipeline/actions/transform_filter.py +6 -6
  61. cloe_nessy/pipeline/actions/transform_generic_sql.py +12 -6
  62. cloe_nessy/pipeline/actions/transform_group_aggregate.py +20 -26
  63. cloe_nessy/pipeline/actions/transform_hash_columns.py +209 -0
  64. cloe_nessy/pipeline/actions/transform_join.py +17 -10
  65. cloe_nessy/pipeline/actions/transform_json_normalize.py +19 -6
  66. cloe_nessy/pipeline/actions/transform_rename_columns.py +7 -7
  67. cloe_nessy/pipeline/actions/transform_replace_values.py +8 -8
  68. cloe_nessy/pipeline/actions/transform_select_columns.py +38 -9
  69. cloe_nessy/pipeline/actions/transform_union.py +12 -8
  70. cloe_nessy/pipeline/actions/write_catalog_table.py +11 -10
  71. cloe_nessy/pipeline/pipeline.py +44 -2
  72. cloe_nessy/pipeline/pipeline_action.py +0 -0
  73. cloe_nessy/pipeline/pipeline_config.py +0 -0
  74. cloe_nessy/pipeline/pipeline_context.py +0 -0
  75. cloe_nessy/pipeline/pipeline_parsing_service.py +0 -0
  76. cloe_nessy/pipeline/pipeline_step.py +0 -0
  77. cloe_nessy/py.typed +0 -0
  78. cloe_nessy/session/__init__.py +0 -0
  79. cloe_nessy/session/session_manager.py +27 -0
  80. cloe_nessy/settings/__init__.py +0 -0
  81. cloe_nessy/settings/settings.py +0 -0
  82. cloe_nessy/utils/__init__.py +0 -0
  83. cloe_nessy/utils/file_and_directory_handler.py +0 -0
  84. cloe_nessy-0.3.8.dist-info/METADATA +46 -0
  85. {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.8.dist-info}/RECORD +41 -35
  86. {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.8.dist-info}/WHEEL +1 -1
  87. {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.8.dist-info}/top_level.txt +0 -0
  88. cloe_nessy-0.3.3.dist-info/METADATA +0 -26
@@ -10,17 +10,31 @@ class TransformConcatColumnsAction(PipelineAction):
     """Concatenates the specified columns in the given DataFrame.
 
     Example:
-        ```yaml
-        Concat Columns:
-            action: TRANSFORM_CONCAT_COLUMNS
-            options:
-                name: address
-                columns:
-                    - street
-                    - postcode
-                    - country
-                separator: ', '
-        ```
+        === "concat with separator"
+            ```yaml
+            Concat Columns:
+                action: TRANSFORM_CONCAT_COLUMNS
+                options:
+                    name: address
+                    columns:
+                        - street
+                        - postcode
+                        - country
+                    separator: ', '
+            ```
+        === "concat without separator"
+            ```yaml
+            Concat Column:
+                action: TRANSFORM_CONCAT_COLUMNS
+                options:
+                    name: address
+                    columns:
+                        - street
+                        - postcode
+                        - country
+            ```
+        !!! warning "beware of null handling"
+            The `separator` option is not provided, so the default behavior is to use `concat` which returns `NULL` if any of the concatenated values is `NULL`.
     """
 
     name: str = "TRANSFORM_CONCAT_COLUMNS"
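The warning added above hinges on PySpark's `concat` vs. `concat_ws` null semantics. A minimal sketch of that difference, with illustrative column names that are not taken from the package:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Main St", "20095", None)],
    "street string, postcode string, country string",
)

# concat propagates NULL: if any input is NULL, the result is NULL.
# concat_ws skips NULL inputs, which is what the separator variant relies on.
df.select(
    F.concat("street", "postcode", "country").alias("concat_result"),              # NULL
    F.concat_ws(", ", "street", "postcode", "country").alias("concat_ws_result"),  # "Main St, 20095"
).show(truncate=False)
```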
@@ -11,13 +11,24 @@ class TransformDecodeAction(PipelineAction):
     """Decodes values of a specified column in the DataFrame based on the given format.
 
     Example:
-        ```yaml
-        Decode Columns:
-            action: TRANSFORM_DECODE
-            options:
-                column: configurations
-                input_format: json
-        ```
+        === "Decode JSON column"
+            ```yaml
+            Expand JSON:
+                action: "TRANSFORM_DECODE"
+                options:
+                    column: "data"
+                    input_format: "json"
+                    schema: "quality INT, timestamp TIMESTAMP, value DOUBLE"
+            ```
+        === "Decode base64 column"
+            ```yaml
+            Decode base64:
+                action: TRANSFORM_DECODE
+                options:
+                    column: encoded_data
+                    input_format: base64
+                    schema: string
+            ```
     """
 
     name: str = "TRANSFORM_DECODE"
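The two new tabs correspond to standard PySpark decoding primitives. A hedged sketch of roughly equivalent transformations written directly against PySpark; the action's internals are not part of this diff, so the sample data and column handling below are purely illustrative:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [('{"quality": 1, "timestamp": "2024-01-01T00:00:00", "value": 3.14}', "aGVsbG8=")],
    "data string, encoded_data string",
)

decoded = (
    df.withColumn("data", F.from_json("data", "quality INT, timestamp TIMESTAMP, value DOUBLE"))  # JSON text -> struct
    .withColumn("encoded_data", F.unbase64("encoded_data").cast("string"))  # base64 -> plain string
)
decoded.show(truncate=False)
```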
@@ -18,15 +18,15 @@ class TransformDeduplication(PipelineAction):
     (can be changed to lowest by setting the parameter descending to false).
 
     Example:
-        ```yaml
-        Deduplicate Columns:
-            action: TRANSFORM_DEDUPLICATION
-            options:
-                key_columns:
-                    - id
-                order_by_columns:
-                    - source_file_modification_time
-        ```
+        ```yaml
+        Deduplicate Columns:
+            action: TRANSFORM_DEDUPLICATION
+            options:
+                key_columns:
+                    - id
+                order_by_columns:
+                    - source_file_modification_time
+        ```
     """
 
     name: str = "TRANSFORM_DEDUPLICATION"
@@ -10,14 +10,14 @@ class TransformDistinctAction(PipelineAction):
     If a subset is given these columns are used for duplicate comparison. If no subset is given all columns are used.
 
     Example:
-        ```yaml
-        Decode Columns:
-            action: TRANSFORM_DISTINCT
-            options:
-                subset:
-                    - first_name
-                    - last_name
-        ```
+        ```yaml
+        Distinct Columns:
+            action: TRANSFORM_DISTINCT
+            options:
+                subset:
+                    - first_name
+                    - last_name
+        ```
     """
 
     name: str = "TRANSFORM_DISTINCT"
@@ -8,12 +8,12 @@ class TransformFilterAction(PipelineAction):
     """Filters the DataFrame in the given context based on a specified condition.
 
     Example:
-        ```yaml
-        Decode Columns:
-            action: TRANSFORM_FILTER
-            options:
-                condition: where city="Hamburg"
-        ```
+        ```yaml
+        Filter Columns:
+            action: TRANSFORM_FILTER
+            options:
+                condition: city="Hamburg"
+        ```
     """
 
     name: str = "TRANSFORM_FILTER"
@@ -13,12 +13,18 @@ class TransformSqlAction(PipelineAction):
     statement is executed on that view. The resulting DataFrame is returned.
 
     Example:
-        ```yaml
-        SQL Transform:
-            action: TRANSFORM_SQL
-            options:
-                sql_statement: select city, revenue, firm from {DATA_FRAME} where product="Databricks"
-        ```
+        ```yaml
+        SQL Transform:
+            action: TRANSFORM_SQL
+            options:
+                sql_statement: select city, revenue, firm from {DATA_FRAME} where product="Databricks"
+        ```
+    !!! note
+        The SQL statement should reference the DataFrame as "{DATA_FRAME}".
+        This nessy specific placeholder will be replaced with your input
+        DataFrame from the context. If your pipeline is defined as an
+        f-string, you can escape the curly braces by doubling them, e.g.,
+        "{{DATA_FRAME}}".
     """
 
     name: str = "TRANSFORM_SQL"
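The new note describes a nessy-specific `{DATA_FRAME}` placeholder; the docstring says the action registers the input DataFrame as a temporary view and runs the statement against it. A rough sketch of that mechanism, where the view name and the substitution step are assumptions for illustration only:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Hamburg", 100.0, "ACME", "Databricks")],
    "city string, revenue double, firm string, product string",
)

view_name = "nessy_input_view"  # assumed name, not taken from the package
df.createOrReplaceTempView(view_name)

sql_statement = 'select city, revenue, firm from {DATA_FRAME} where product="Databricks"'
result = spark.sql(sql_statement.replace("{DATA_FRAME}", view_name))
result.show()
```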
@@ -13,33 +13,27 @@ class TransformGroupAggregate(PipelineAction):
     to other columns. The aggregation functions can be specified as a dictionary where keys are column names
     and values are either a single aggregation function or a list of functions.
 
+    The output DataFrame will contain the grouped columns and the aggregated columns with the aggregation
+    function as a prefix to the column name.
+
     Example:
-        ```yaml
-        Transform Group Aggregate:
-            action: TRANSFORM_GROUP_AGGREGATE
-            options:
-                grouping_columns:
-                    - column1
-                    - column2
-                aggregations:
-                    column3:
-                        - sum
-                        - avg
-                    column4: max
-        ```
-
-    Attributes:
-        name (str): The name of the action, default is "TRANSFORM_GROUP_AGGREGATE".
-
-    Methods:
-        run(context, grouping_columns=None, aggregations=None, **_):
-            Executes the aggregation on the grouped data.
-
-    Raises:
-        ValueError: If the context data is None.
-        ValueError: If no aggregations are provided.
-        ValueError: If invalid aggregation operations are provided.
-        ValueError: If columns with unsupported data types are included in the aggregations.
+        ```yaml
+        Transform Group Aggregate:
+            action: TRANSFORM_GROUP_AGGREGATE
+            options:
+                grouping_columns:
+                    - column1
+                    - column2
+                aggregations:
+                    column3:
+                        - sum
+                        - avg
+                    column4: max
+        ```
+
+        This example groups the DataFrame by `column1` and `column2` and aggregates `column3` by sum and average
+        and `column4` by max. The resulting DataFrame will contain the grouped columns `column1` and `column2`
+        and the aggregated columns `sum_column3`, `avg_column3`, and `max_column4`.
     """
 
     name: str = "TRANSFORM_GROUP_AGGREGATE"
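The column-naming convention spelled out in the updated docstring (`sum_column3`, `avg_column3`, `max_column4`) can be reproduced with plain PySpark aggregation; the explicit `alias` calls below illustrate the naming and are not the action's implementation:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", "x", 10, 5), ("a", "x", 20, 7)],
    "column1 string, column2 string, column3 int, column4 int",
)

aggregated = df.groupBy("column1", "column2").agg(
    F.sum("column3").alias("sum_column3"),
    F.avg("column3").alias("avg_column3"),
    F.max("column4").alias("max_column4"),
)
aggregated.show()
```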
@@ -0,0 +1,209 @@
+from typing import Any
+
+from pydantic import BaseModel, Field, model_validator
+from pyspark.sql import functions as F
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+SUPPORTED_ALGORITHMS = {"hash", "md5", "sha1", "sha2", "xxhash64", "crc32"}
+VALID_SHA2_BITS = {224, 256, 384, 512}
+
+
+class HashSettings(BaseModel):
+    """Represents the settings for hashing columns.
+
+    Attributes:
+        columns: List of column names to hash.
+        algorithm: Hashing algorithm to use. Must be one of
+            "hash", "md5", "sha1", "sha2", "xxhash64", or "crc32".
+        bits: Bit length for the 'sha2' algorithm. Optional.
+    """
+
+    columns: list[str]
+    algorithm: str = Field(..., description="Hashing algorithm to use")
+    bits: int | None = Field(default=None, description="Only required for sha2")
+
+    @model_validator(mode="before")
+    def validate_all(cls, values):
+        """Validates the input values for a hashing operation before model instantiation.
+
+        This method performs the following checks:
+
+        1. Ensures the specified hashing algorithm is supported.
+        2. Validates that at least one column is provided and that the columns parameter is a non-empty list.
+        3. Checks that hashing multiple columns is only supported for the 'hash' and 'xxhash64' algorithms.
+        4. For the 'sha2' algorithm, ensures that the 'bits' parameter is one of the valid options.
+        5. Ensures that the 'bits' parameter is not provided for algorithms other than 'sha2'.
+
+        Raises:
+            ValueError: If the algorithm is unsupported, no columns are provided, the columns parameter is invalid,
+                or the 'bits' parameter is invalid for the specified algorithm.
+            NotImplementedError: If multiple columns are provided and the algorithm does not support hashing multiple columns.
+
+        Args:
+            cls: The class being validated.
+            values: A dictionary of input values containing 'algorithm', 'columns', and 'bits'.
+
+        Returns:
+            The validated input values.
+        """
+        algorithm = values.get("algorithm")
+        columns = values.get("columns")
+        bits = values.get("bits")
+
+        if algorithm not in SUPPORTED_ALGORITHMS:
+            raise ValueError(
+                f"Unsupported hashing algorithm '{algorithm}'. Supported algorithms are: {SUPPORTED_ALGORITHMS}."
+            )
+
+        if not columns or not isinstance(columns, list) or len(columns) == 0:
+            raise ValueError("At least one column must be provided.")
+
+        if len(columns) > 1 and algorithm not in {"hash", "xxhash64"}:
+            raise NotImplementedError(
+                f"Hashing multiple columns is only supported for 'hash' and 'xxhash64'. Algorithm '{algorithm}' does not support this."
+            )
+
+        if algorithm == "sha2":
+            if bits not in VALID_SHA2_BITS:
+                raise ValueError(f"'bits' must be one of {VALID_SHA2_BITS} when using 'sha2'.")
+        elif bits is not None:
+            raise ValueError("'bits' is only allowed when algorithm is 'sha2'.")
+
+        return values
+
+
+class HashConfig(BaseModel):
+    """A configuration model for defining hash settings for specific columns.
+
+    Attributes:
+        hash_config: A dictionary where the keys are column names
+            (as strings) and the values are `HashSettings` objects that define
+            the hash settings for each column.
+
+    Methods:
+        validate_config: Validates the hash configuration to ensure it contains
+            at least one entry and that all column names are valid strings. Raises a
+            `ValueError` if the configuration is invalid.
+    """
+
+    hash_config: dict[str, HashSettings]
+
+    @model_validator(mode="before")
+    def validate_config(cls, values):
+        """Validates the hash configuration provided in the model.
+
+        This method is executed in "before" mode to ensure that the `hash_config`
+        field in the input values meets the required criteria:
+
+        - It must be a dictionary.
+        - It must contain at least one entry.
+        - Each key in the dictionary must be a non-empty string.
+
+        Raises:
+            ValueError: If `hash_config` is missing, not a dictionary, empty, or
+                contains invalid column names.
+
+        Args:
+            cls: The class to which this validator is applied.
+            values: The input values to validate.
+
+        Returns:
+            The validated input values.
+        """
+        config = values.get("hash_config")
+        if not config or not isinstance(config, dict) or len(config) == 0:
+            raise ValueError("Hash configuration must contain at least one entry.")
+        for new_col in config:
+            if not new_col or not isinstance(new_col, str):
+                raise ValueError(f"Invalid column name '{new_col}' in hash configuration.")
+        return values
+
+
+class TransformHashColumnsAction(PipelineAction):
+    """Hashes specified columns in a DataFrame using a chosen algorithm.
+
+    Given the following `hash_config`:
+
+    Example:
+        ```yaml
+        Hash Columns:
+            action: TRANSFORM_HASH_COLUMNS
+            options:
+                hash_config:
+                    - hashed_column1:
+                        columns: ["column1", "column2"]
+                        algorithm: "sha2"
+                        bits: 224
+                    - hashed_column2:
+                        columns: ["column1"]
+                        algorithm: "crc32"
+        ```
+
+    Given a DataFrame `df` with the following structure:
+
+    | column1 | column2 | column3 |
+    |---------|---------|---------|
+    | foo     | bar     | baz     |
+
+    After running the action, the resulting DataFrame will look like:
+
+    | column1 | column2 | column3 | hashed_column1 | hashed_column2 |
+    |---------|---------|---------|----------------|----------------|
+    | foo     | bar     | baz     | 17725b837e9c896e7123b142eb980131dcc0baa6160db45d4adfdb21 | 1670361220 |
+
+
+    !!! note "Hash values might vary"
+        The actual hash values will depend on the hashing algorithm used and the input data.
+    """
+
+    name: str = "TRANSFORM_HASH_COLUMNS"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        hash_config: HashConfig | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Hashes the specified columns in the DataFrame.
+
+        Args:
+            context: Context in which this Action is executed.
+            hash_config: Dictionary that contains the configuration for executing the hashing.
+
+        Returns:
+            Updated PipelineContext with hashed columns.
+
+        Raises:
+            ValueError: If columns are missing, data is None, or algorithm/bits are invalid.
+            ValueError: If the hash configuration is invalid.
+        """
+        if context.data is None:
+            raise ValueError("Context data is required for hashing.")
+
+        if not hash_config:
+            raise ValueError("Hash configuration is required.")
+
+        df = context.data
+
+        hash_functions = {
+            "hash": lambda cols: F.hash(*[F.col(c) for c in cols]).cast("string"),
+            "xxhash64": lambda cols: F.xxhash64(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
+            "md5": lambda cols: F.md5(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
+            "sha1": lambda cols: F.sha1(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
+            "sha2": lambda cols, bits: F.sha2(F.concat_ws("||", *[F.col(c) for c in cols]), bits).cast("string"),
+            "crc32": lambda cols: F.crc32(F.concat_ws("||", *[F.col(c) for c in cols])).cast("string"),
+        }
+        default_sha2_bits = 256
+
+        config_obj = HashConfig.model_validate({"hash_config": hash_config})
+        for new_col, config in config_obj.hash_config.items():
+            hash_func = hash_functions[config.algorithm]
+            if config.algorithm == "sha2":
+                df = df.withColumn(new_col, hash_func(config.columns, config.bits or default_sha2_bits))  # type: ignore
+            else:
+                df = df.withColumn(new_col, hash_func(config.columns))  # type: ignore
+
+        return context.from_existing(data=df)
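Since the new `TransformHashColumnsAction` builds its expressions from `pyspark.sql.functions`, the two entries in the docstring's `hash_config` example can be reproduced directly with PySpark. The sketch below bypasses the action and its pydantic validation, so treat it as an illustration of the generated expressions rather than the package's code path:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("foo", "bar", "baz")],
    "column1 string, column2 string, column3 string",
)

hashed = (
    # sha2 over multiple columns: the action concatenates them with "||" first.
    df.withColumn("hashed_column1", F.sha2(F.concat_ws("||", "column1", "column2"), 224))
    # crc32 over a single column, cast to string as the action does.
    .withColumn("hashed_column2", F.crc32(F.concat_ws("||", "column1")).cast("string"))
)
hashed.show(truncate=False)
```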
@@ -8,18 +8,25 @@ from ..pipeline_step import PipelineStep
 class TransformJoinAction(PipelineAction):
     """Joins the current DataFrame with another DataFrame defined in joined_data.
 
-    The join operation is performed based on specified columns and the type of join
-    indicated by the `how` parameter.
+    The join operation is performed based on specified columns and the type of
+    join indicated by the `how` parameter. Supported join types can be taken
+    from [PySpark
+    documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html)
 
     Example:
-        ```yaml
-        Join Tables:
-            action: TRANSFORM_JOIN
-            options:
-                joined_data: ((step:Transform First Table))
-                join_on: id
-                how: anti
-        ```
+        ```yaml
+        Join Tables:
+            action: TRANSFORM_JOIN
+            options:
+                joined_data: ((step:Transform First Table))
+                join_on: id
+                how: anti
+        ```
+
+    !!! note "Referencing a DataFrame from another step"
+        The `joined_data` parameter is a reference to the DataFrame from another step.
+        The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
+        for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
     """
 
     name: str = "TRANSFORM_JOIN"
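For the `how: anti` value in the example, a small PySpark reference of how the join behaves once both DataFrames are resolved; the `((step:...))` resolution itself is nessy-specific and not shown here:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
left = spark.createDataFrame([(1, "keep"), (2, "drop")], "id int, label string")
right = spark.createDataFrame([(2,)], "id int")

# "anti" keeps only left-side rows with no match on the right (here: id=1).
left.join(right, on="id", how="anti").show()
```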
@@ -14,12 +14,25 @@ class TransformJsonNormalize(PipelineAction):
     structs are appended after existing columns.
 
     Example:
-        ```yaml
-        Normalize Tables:
-            action: TRANSFORM_JSON_NORMALIZE
-            options:
-                exclude_columns: coordinates
-        ```
+        ```yaml
+        Normalize Tables:
+            action: TRANSFORM_JSON_NORMALIZE
+            options:
+                exclude_columns: coordinates
+        ```
+        Example Input Data:
+
+        | id | name  | coordinates  | attributes                |
+        |----|-------|--------------|---------------------------|
+        | 1  | Alice | [10.0, 20.0] | {"age": 30, "city": "NY"} |
+        | 2  | Bob   | [30.0, 40.0] | {"age": 25, "city": "LA"} |
+
+        Example Output Data:
+
+        | id | name  | coordinates  | attributes_age | attributes_city |
+        |----|-------|--------------|----------------|-----------------|
+        | 1  | Alice | [10.0, 20.0] | 30             | NY              |
+        | 2  | Bob   | [30.0, 40.0] | 25             | LA              |
     """
 
     name: str = "TRANSFORM_JSON_NORMALIZE"
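The input/output tables added above amount to flattening nested fields into top-level columns. A hedged PySpark sketch with the same illustrative rows; the action discovers the nested fields automatically, whereas the field list below is hard-coded:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [
        (1, "Alice", [10.0, 20.0], {"age": "30", "city": "NY"}),
        (2, "Bob", [30.0, 40.0], {"age": "25", "city": "LA"}),
    ],
    "id int, name string, coordinates array<double>, attributes map<string,string>",
)

# `coordinates` is left untouched, mirroring `exclude_columns: coordinates`.
flattened = df.select(
    "id",
    "name",
    "coordinates",
    F.col("attributes")["age"].alias("attributes_age"),
    F.col("attributes")["city"].alias("attributes_city"),
)
flattened.show(truncate=False)
```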
@@ -12,13 +12,13 @@ class TransformRenameColumnsAction(PipelineAction):
     name and its corresponding value represents the new column name.
 
     Example:
-        ```yaml
-        Rename Column:
-            action: TRANSFORM_RENAME_COLUMNS
-            options:
-                columns:
-                    a_very_long_column_name: shortname
-        ```
+        ```yaml
+        Rename Column:
+            action: TRANSFORM_RENAME_COLUMNS
+            options:
+                columns:
+                    a_very_long_column_name: shortname
+        ```
     """
 
     name: str = "TRANSFORM_RENAME_COLUMNS"
@@ -13,14 +13,14 @@ class TransformReplaceValuesAction(PipelineAction):
     in the specified columns.
 
     Example:
-        ```yaml
-        Replace Values:
-            action: TRANSFORM_REPLACE_VALUES
-            options:
-                replace:
-                    empl_function:
-                        sales_employee: seller
-        ```
+        ```yaml
+        Replace Values:
+            action: TRANSFORM_REPLACE_VALUES
+            options:
+                replace:
+                    empl_function:
+                        sales_employee: seller
+        ```
     """
 
     name: str = "TRANSFORM_REPLACE_VALUES"
@@ -14,15 +14,44 @@ class TransformSelectColumnsAction(PipelineAction):
     DataFrame before performing the selection.
 
     Example:
-        ```yaml
-        Select Columns:
-            action: TRANSFORM_SELECT_COLUMNS
-            options:
-                include_columns:
-                    - id
-                    - city
-                    - product
-        ```
+        Example Input Data:
+
+        | id | name  | coordinates  | attributes                |
+        |----|-------|--------------|---------------------------|
+        | 1  | Alice | [10.0, 20.0] | {"age": 30, "city": "NY"} |
+        | 2  | Bob   | [30.0, 40.0] | {"age": 25, "city": "LA"} |
+        === "Include Columns"
+            ```yaml
+            Select Columns:
+                action: TRANSFORM_SELECT_COLUMNS
+                options:
+                    include_columns:
+                        - id
+                        - name
+                        - coordinates
+            ```
+            Example Output Data:
+
+            | id | name  | coordinates  |
+            |----|-------|--------------|
+            | 1  | Alice | [10.0, 20.0] |
+            | 2  | Bob   | [30.0, 40.0] |
+
+        === "Exclude Columns"
+            ```yaml
+            Select Columns:
+                action: TRANSFORM_SELECT_COLUMNS
+                options:
+                    exclude_columns:
+                        - coordinates
+            ```
+            Example Output Data:
+
+            | id | name  | attributes                |
+            |----|-------|---------------------------|
+            | 1  | Alice | {"age": 30, "city": "NY"} |
+            | 2  | Bob   | {"age": 25, "city": "LA"} |
+
     """
 
     name: str = "TRANSFORM_SELECT_COLUMNS"
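The two tabs map naturally onto `DataFrame.select` and `DataFrame.drop`; a short sketch with simplified illustrative columns:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "Alice", "NY"), (2, "Bob", "LA")],
    "id int, name string, city string",
)

included = df.select("id", "name")  # include_columns: keep only the listed columns
excluded = df.drop("city")          # exclude_columns: keep everything but the listed columns
included.show()
excluded.show()
```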
@@ -17,14 +17,18 @@ class TransformUnionAction(PipelineAction):
     empty, a ValueError will be raised.
 
     Example:
-        ```yaml
-        Union Tables:
-            action: TRANSFORM_UNION
-            options:
-                union_data:
-                    - ((step: Filter First Table))
-                    - ((step: SQL Transform Second Table))
-        ```
+        ```yaml
+        Union Tables:
+            action: TRANSFORM_UNION
+            options:
+                union_data:
+                    - ((step: Filter First Table))
+                    - ((step: SQL Transform Second Table))
+        ```
+    !!! note "Referencing a DataFrame from another step"
+        The `union_data` parameter is a reference to the DataFrame from another step.
+        The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
+        for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
     """
 
     name: str = "TRANSFORM_UNION"
@@ -9,15 +9,16 @@ class WriteCatalogTableAction(PipelineAction):
     """Writes a DataFrame to a specified catalog table using [CatalogWriter][cloe_nessy.integration.writer.CatalogWriter].
 
     Example:
-        ```yaml
-        Write Table to Catalog:
-            action: WRITE_CATALOG_TABLE
-            options:
-                table_identifier: my_catalog.business_schema.sales_table
-                mode: append
-                partition_by: day
-                options: <options for the writer>
-        ```
+        ```yaml
+        Write Table to Catalog:
+            action: WRITE_CATALOG_TABLE
+            options:
+                table_identifier: my_catalog.business_schema.sales_table
+                mode: append
+                partition_by: day
+                options:
+                    mergeSchema: true
+        ```
     """
 
     name: str = "WRITE_CATALOG_TABLE"
@@ -42,7 +43,7 @@ class WriteCatalogTableAction(PipelineAction):
             mode: The write mode. One of 'append', 'overwrite', 'error',
                 'errorifexists', or 'ignore'.
             partition_by: Names of the partitioning columns.
-            options: Additional options for the write operation.
+            options: PySpark options for the DataFrame.saveAsTable operation (e.g. mergeSchema:true).
 
         Raises:
             ValueError: If the table name is not specified or cannot be inferred from
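The clarified `options` description refers to standard PySpark writer options. A hedged sketch of how such options are typically forwarded to `saveAsTable`; the writer class internals are not part of this diff, and the target catalog and schema must already exist:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "2024-01-01")], "id int, day string")

(
    df.write.mode("append")
    .partitionBy("day")
    .options(mergeSchema="true")  # forwarded writer options, e.g. mergeSchema
    .saveAsTable("my_catalog.business_schema.sales_table")  # identifier from the docstring example
)
```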