cloe-nessy 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. cloe_nessy/integration/reader/file_reader.py +7 -1
  2. cloe_nessy/integration/writer/catalog_writer.py +1 -1
  3. cloe_nessy/pipeline/actions/read_api.py +69 -45
  4. cloe_nessy/pipeline/actions/read_catalog_table.py +9 -9
  5. cloe_nessy/pipeline/actions/read_excel.py +14 -10
  6. cloe_nessy/pipeline/actions/read_files.py +54 -28
  7. cloe_nessy/pipeline/actions/read_metadata_yaml.py +9 -9
  8. cloe_nessy/pipeline/actions/transform_change_datatype.py +13 -8
  9. cloe_nessy/pipeline/actions/transform_clean_column_names.py +4 -0
  10. cloe_nessy/pipeline/actions/transform_concat_columns.py +25 -11
  11. cloe_nessy/pipeline/actions/transform_decode.py +18 -7
  12. cloe_nessy/pipeline/actions/transform_deduplication.py +9 -9
  13. cloe_nessy/pipeline/actions/transform_distinct.py +8 -8
  14. cloe_nessy/pipeline/actions/transform_filter.py +6 -6
  15. cloe_nessy/pipeline/actions/transform_generic_sql.py +12 -6
  16. cloe_nessy/pipeline/actions/transform_group_aggregate.py +20 -26
  17. cloe_nessy/pipeline/actions/transform_join.py +17 -10
  18. cloe_nessy/pipeline/actions/transform_json_normalize.py +19 -6
  19. cloe_nessy/pipeline/actions/transform_rename_columns.py +7 -7
  20. cloe_nessy/pipeline/actions/transform_replace_values.py +8 -8
  21. cloe_nessy/pipeline/actions/transform_select_columns.py +38 -9
  22. cloe_nessy/pipeline/actions/transform_union.py +12 -8
  23. cloe_nessy/pipeline/actions/write_catalog_table.py +11 -10
  24. {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.5.dist-info}/METADATA +1 -1
  25. {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.5.dist-info}/RECORD +27 -27
  26. {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.5.dist-info}/WHEEL +1 -1
  27. {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.5.dist-info}/top_level.txt +0 -0
@@ -46,7 +46,13 @@ class FileReader(BaseReader):
  if not spark_format and not extension:
  raise ValueError("Either spark_format or extension must be provided.")
  self._console_logger.debug(f"Reading files from [ '{location}' ] ...")
- extension_to_datatype_dict = {"csv": "csv", "json": "json", "parquet": "parquet", "txt": "text", "xml": "xml"}
+ extension_to_datatype_dict = {
+ "csv": "csv",
+ "json": "json",
+ "parquet": "parquet",
+ "txt": "text",
+ "xml": "xml",
+ }

  if extension and not spark_format:
  if extension not in extension_to_datatype_dict:
@@ -20,7 +20,7 @@ class CatalogWriter:
  format 'catalog.schema.table'.
  mode: The write mode. One of append, overwrite, error, errorifexists, ignore.
  partition_by: Names of the partitioning columns.
- options: All other string options.
+ options: PySpark options for the DataFrame.saveAsTable operation (e.g. mergeSchema:true).

  Notes:
  append: Append contents of this DataFrame to existing data.
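To illustrate the clarified `options` description above, here is a minimal PySpark sketch of how such options reach `saveAsTable`; it assumes an existing Spark session with a catalog configured and is not taken from the package source:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "2024-01-01")], ["id", "day"])

# mode, partition_by, and options map onto the standard DataFrameWriter calls;
# options such as mergeSchema are forwarded to the underlying saveAsTable operation.
(
    df.write.mode("append")
    .partitionBy("day")
    .options(mergeSchema="true")
    .saveAsTable("my_catalog.business_schema.sales_table")
)
```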
@@ -55,51 +55,75 @@ class ReadAPIAction(PipelineAction):
  DataFrame containing the response data.

  Example:
- ```yaml
- Read API:
- action: READ_API
- options:
- base_url: https://some_url.com/api/
- endpoint: my/endpoint/
- method: GET
- timeout: 90
- auth:
- - type: basic
- username: my_username
- password: my_password
- - type: secret_scope
- secret_scope: my_secret_scope
- header_template:
- "header_key_1": "<ENVIRONMENT_VARIABLE_NAME>"
- - type: secret_scope
- secret_scope: my_secret_scope
- header_template:
- "header_key_2": "<SECRET_NAME>"
- - type: secret_scope
- secret_scope: my_other_secret_scope
- header_template:
- "header_key_3": "<SECRET_NAME>"
- - type: azure_oauth
- client_id: my_client_id
- client_secret: my_client_secret
- tenant_id: my_tenant_id
- scope: <entra-id-client-id>
- ```
-
- The above example will combine the headers from the different auth types. The resulting header will look like this:
-
- ```json
- {
- "header_key_1": "value_from_environment_variable",
- "header_key_2": "value_from_secret",
- "header_key_3": "value_from_secret",
- "Authorization": "Bearer <access_token> (from azure_oauth)",
- "Authorization": "Basic am9obkBleGFtcGxlLmNvbTphYmMxMjM= (from basic)"
- }
- ```
-
- !!! warning
-
+ === "Basic Usage"
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: my/endpoint/
+ ```
+ === "Usage with Parameters and Headers"
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: my/endpoint/
+ method: GET
+ timeout: 90
+ headers:
+ key1: value1
+ key2: value2
+ params:
+ key1: value1
+ key2: value2
+ ```
+ === "Usage with Authentication"
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: my/endpoint/
+ method: GET
+ timeout: 90
+ auth:
+ - type: basic
+ username: my_username
+ password: my_password
+ - type: secret_scope
+ secret_scope: my_secret_scope
+ header_template:
+ "header_key_1": "<ENVIRONMENT_VARIABLE_NAME>"
+ - type: secret_scope
+ secret_scope: my_secret_scope
+ header_template:
+ "header_key_2": "<SECRET_NAME>"
+ - type: secret_scope
+ secret_scope: my_other_secret_scope
+ header_template:
+ "header_key_3": "<SECRET_NAME>"
+ - type: azure_oauth
+ client_id: my_client_id
+ client_secret: my_client_secret
+ tenant_id: my_tenant_id
+ scope: <entra-id-client-id>
+ ```
+
+ The above example will combine the headers from the different auth types. The resulting header will look like this:
+
+ ```json
+ {
+ "header_key_1": "value_from_environment_variable",
+ "header_key_2": "value_from_secret",
+ "header_key_3": "value_from_secret",
+ "Authorization": "Bearer <access_token> (from azure_oauth)",
+ "Authorization": "Basic am9obkBleGFtcGxlLmNvbTphYmMxMjM= (from basic)"
+ }
+ ```
+
+ !!! warning "Secret information"
  Don't write sensitive information like passwords or tokens directly in the pipeline configuration.
  Use secret scopes or environment variables instead.
  """
@@ -15,13 +15,13 @@ class ReadCatalogTableAction(PipelineAction):
  into a DataFrame and returned as part of an updated `PipelineContext`.

  Example:
- ```yaml
- Read Sales Table:
- action: READ_CATALOG_TABLE
- options:
- table_identifier: my_catalog.business_schema.sales_table
- options: <options for the reader>
- ```
+ ```yaml
+ Read Sales Table:
+ action: READ_CATALOG_TABLE
+ options:
+ table_identifier: my_catalog.business_schema.sales_table
+ options: <options for the CatalogReader read method>
+ ```
  """

  name: str = "READ_CATALOG_TABLE"
@@ -43,8 +43,8 @@ class ReadCatalogTableAction(PipelineAction):
  read. If not provided, the function will attempt to use the table
  identifier from the `table_metadata` in the `context`.
  options: A dictionary of options for customizing
- the catalog reader's behavior, such as filters or reading modes. Defaults
- to None.
+ the [`CatalogReader`][cloe_nessy.integration.reader.catalog_reader]
+ behavior, such as filters or reading modes. Defaults to None.

  Raises:
  ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
@@ -21,16 +21,20 @@ class ReadExcelAction(PipelineAction):
  the read files can be included in the context.

  Example:
- ```yaml
- Read Excel Table:
- action: READ_EXCEL
- options:
- file: excel_file_folder/excel_files_june/interesting_excel_file.xlsx
- usecols:
- - key_column
- - interesting_column
- options: <more options for the reader>
- ```
+ ```yaml
+ Read Excel Table:
+ action: READ_EXCEL
+ options:
+ file: excel_file_folder/excel_files_june/interesting_excel_file.xlsx
+ usecols:
+ - key_column
+ - interesting_column
+ options: <options for the ExcelDataFrameReader read method>
+ ```
+
+ !!! note "More Options"
+ The `READ_EXCEL` action supports additional options that can be passed to the
+ run method. For more information, refer to the method documentation.
  """

  name: str = "READ_EXCEL"
@@ -14,14 +14,47 @@ class ReadFilesAction(PipelineAction):
  location will be read using a DataFrameReader with the specified format.

  Example:
- ```yaml
- Read Excel Table:
- action: READ_FILES
- options:
- location: excel_file_folder/excel_files_june/
- search_subdirs: True
- spark_format: AVRO
- ```
+ === "Read files specified by spark_format"
+ ```yaml
+ Read Files:
+ action: READ_FILES
+ options:
+ location: json_file_folder/
+ search_subdirs: True
+ spark_format: JSON
+ ```
+ !!! note "Define Spark Format"
+ Use the `spark_format` option to specify the format with which
+ to read the files. Supported formats are e.g., `CSV`, `JSON`,
+ `PARQUET`, `TEXT`, and `XML`.
+
+ === "Read files specified by extension"
+ ```yaml
+ Read Files:
+ action: READ_FILES
+ options:
+ location: csv_file_folder/
+ search_subdirs: True
+ extension: csv
+ ```
+ !!! note "Define Extension"
+ Use the `extension` option to specify the extension of the files
+ to read. If not specified, the `spark_format` will be derived from
+ the extension.
+
+ === "Read files with a specified spark_format AND extension"
+ ```yaml
+ Read Files:
+ action: READ_FILES
+ options:
+ location: file_folder/
+ extension: abc_custom_extension # specifies the files to read
+ spark_format: CSV # specifies the format to read the files with
+ ```
+ !!! note "Define both Extension & Spark Format"
+ Use the `extension` option to specify the extension of the files
+ to read. Additionally, use the `spark_format` option to specify
+ the format with which to read the files.
  """

  name: str = "READ_FILES"
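The tabs above describe how `extension` and `spark_format` interact; a minimal sketch of that resolution, reusing the `extension_to_datatype_dict` mapping from the FileReader hunk above (the reader option and path are illustrative only):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Mapping taken from the FileReader diff above.
extension_to_datatype_dict = {"csv": "csv", "json": "json", "parquet": "parquet", "txt": "text", "xml": "xml"}

extension = "csv"
spark_format = None  # when not given, the format is derived from the extension
resolved_format = spark_format or extension_to_datatype_dict[extension]

df = (
    spark.read.format(resolved_format)
    .option("header", "true")   # illustrative reader option
    .load("csv_file_folder/")   # hypothetical location
)
```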
@@ -47,7 +80,8 @@ class ReadFilesAction(PipelineAction):
  search_subdirs: Recursively search subdirectories for files
  if an extension is provided.
  extension: The file extension to filter files by.
- spark_format: The format to use for reading the files.
+ spark_format: The format to use for reading the files. If not provided,
+ it will be deferred from the file extension.
  schema: The schema of the data. If None, schema is obtained from
  the context metadata.
  add_metadata_column: Whether to include the `__metadata` column with
@@ -65,30 +99,22 @@
  raise ValueError("No location provided. Please specify location to read files from.")
  if not options:
  options = dict()
+ if not spark_format and not extension:
+ raise ValueError("Either spark_format or extension must be provided.")

  if (metadata := context.table_metadata) and schema is None:
  schema = metadata.schema

  file_reader = FileReader()
- if extension:
- df = file_reader.read(
- location=location,
- schema=schema,
- extension=extension,
- search_subdirs=search_subdirs,
- options=options,
- add_metadata_column=add_metadata_column,
- )
- elif spark_format:
- df = file_reader.read(
- location=location,
- schema=schema,
- spark_format=spark_format,
- options=options,
- add_metadata_column=add_metadata_column,
- )
- else:
- raise ValueError("Please provide either the 'extension' or 'spark_format'")
+ df = file_reader.read(
+ location=location,
+ schema=schema,
+ extension=extension,
+ spark_format=spark_format,
+ search_subdirs=search_subdirs,
+ options=options,
+ add_metadata_column=add_metadata_column,
+ )

  runtime_info = context.runtime_info

@@ -10,14 +10,14 @@ class ReadMetadataYAMLAction(PipelineAction):
  """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.

  Example:
- ```yaml
- Read Schema Metadata:
- action: READ_METADATA_YAML_ACTION
- options:
- path: excel_file_folder/excel_files_june/
- file_name: sales_schema.yml
- table_name: sales
- ```
+ ```yaml
+ Read Schema Metadata:
+ action: READ_METADATA_YAML_ACTION
+ options:
+ path: excel_file_folder/excel_files_june/
+ file_name: sales_schema.yml
+ table_name: sales
+ ```
  """

  name: str = "READ_METADATA_YAML_ACTION"
@@ -31,7 +31,7 @@ class ReadMetadataYAMLAction(PipelineAction):
  table_name: str | None = None,
  **_: Any,
  ) -> PipelineContext:
- """Reads schema metadata from a yaml file using the `Schema` model.
+ """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.

  Args:
  context: The context in which this Action is executed.
@@ -9,15 +9,20 @@ from ..pipeline_context import PipelineContext
  class TransformChangeDatatypeAction(PipelineAction):
  """Changes the datatypes of specified columns in the given DataFrame.

+ !!! note "Data Types"
+ We make use of the PySpark `cast` function to change the data types of
+ the columns. Valid data types can be found in the [PySpark
+ documentation](https://spark.apache.org/docs/3.5.3/sql-ref-datatypes.html).
+
  Example:
- ```yaml
- Transform Columns:
- action: TRANSFORM_CHANGE_DATATYPE
- options:
- columns:
- id: string
- revenue: long
- ```
+ ```yaml
+ Cast Columns:
+ action: TRANSFORM_CHANGE_DATATYPE
+ options:
+ columns:
+ id: string
+ revenue: long
+ ```
  """

  name: str = "TRANSFORM_CHANGE_DATATYPE"
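A minimal PySpark sketch of what the `columns` mapping above corresponds to, using the `cast` function mentioned in the note (assumes an existing Spark session; not taken from the package source):

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 100), (2, 200)], ["id", "revenue"])

# Equivalent of the mapping {id: string, revenue: long}.
df_cast = (
    df.withColumn("id", F.col("id").cast("string"))
    .withColumn("revenue", F.col("revenue").cast("long"))
)
df_cast.printSchema()
```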
@@ -15,6 +15,10 @@ class TransformCleanColumnNamesAction(PipelineAction):
  Removes invalid characters from the column names, including the fields of a struct and
  replaces a single leading underscore by a double underscore.

+ Invalid characters include:
+ - Any non-word character (anything other than letters, digits, and underscores).
+ - A single leading underscore.
+
  Example:
  ```yaml
  Clean Column Names:
@@ -10,17 +10,31 @@ class TransformConcatColumnsAction(PipelineAction):
  """Concatenates the specified columns in the given DataFrame.

  Example:
- ```yaml
- Concat Columns:
- action: TRANSFORM_CONCAT_COLUMNS
- options:
- name: address
- columns:
- - street
- - postcode
- - country
- separator: ', '
- ```
+ === "concat with separator"
+ ```yaml
+ Concat Columns:
+ action: TRANSFORM_CONCAT_COLUMNS
+ options:
+ name: address
+ columns:
+ - street
+ - postcode
+ - country
+ separator: ', '
+ ```
+ === "concat without separator"
+ ```yaml
+ Concat Column:
+ action: TRANSFORM_CONCAT_COLUMNS
+ options:
+ name: address
+ columns:
+ - street
+ - postcode
+ - country
+ ```
+ !!! warning "beware of null handling"
+ The `separator` option is not provided, so the default behavior is to use `concat` which returns `NULL` if any of the concatenated values is `NULL`.
  """

  name: str = "TRANSFORM_CONCAT_COLUMNS"
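The null-handling warning above can be seen directly in PySpark: `concat_ws` (used when a separator is given) skips NULLs, while `concat` propagates them. A small illustrative sketch, not taken from the package source:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Main St", "20095", None)],
    "street string, postcode string, country string",
)

df.select(
    F.concat_ws(", ", "street", "postcode", "country").alias("with_separator"),  # "Main St, 20095"
    F.concat("street", "postcode", "country").alias("without_separator"),        # NULL
).show(truncate=False)
```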
@@ -11,13 +11,24 @@ class TransformDecodeAction(PipelineAction):
  """Decodes values of a specified column in the DataFrame based on the given format.

  Example:
- ```yaml
- Decode Columns:
- action: TRANSFORM_DECODE
- options:
- column: configurations
- input_format: json
- ```
+ === "Decode JSON column"
+ ```yaml
+ Expand JSON:
+ action: "TRANSFORM_DECODE"
+ options:
+ column: "data"
+ input_format: "json"
+ schema: "quality INT, timestamp TIMESTAMP, value DOUBLE"
+ ```
+ === "Decode base64 column"
+ ```yaml
+ Decode base64:
+ action: TRANSFORM_DECODE
+ options:
+ column: encoded_data
+ input_format: base64
+ schema: string
+ ```
  """

  name: str = "TRANSFORM_DECODE"
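A hedged sketch of the two decodings shown in the tabs above, expressed directly in PySpark (`from_json` for the JSON case, `unbase64` plus a cast for base64); the action's own implementation may differ:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# JSON decoding against the schema string from the example above.
json_df = spark.createDataFrame(
    [('{"quality": 1, "timestamp": "2024-01-01 00:00:00", "value": 42.0}',)], ["data"]
)
decoded_json = json_df.withColumn(
    "decoded", F.from_json("data", "quality INT, timestamp TIMESTAMP, value DOUBLE")
)

# base64 decoding: unbase64 yields binary, which is then cast to string.
b64_df = spark.createDataFrame([("aGVsbG8=",)], ["encoded_data"])
decoded_b64 = b64_df.withColumn("decoded", F.unbase64("encoded_data").cast("string"))
```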
@@ -18,15 +18,15 @@ class TransformDeduplication(PipelineAction):
  (can be changed to lowest by setting the parameter descending to false).

  Example:
- ```yaml
- Deduplicate Columns:
- action: TRANSFORM_DEDUPLICATION
- options:
- key_columns:
- - id
- order_by_columns:
- - source_file_modification_time
- ```
+ ```yaml
+ Deduplicate Columns:
+ action: TRANSFORM_DEDUPLICATION
+ options:
+ key_columns:
+ - id
+ order_by_columns:
+ - source_file_modification_time
+ ```
  """

  name: str = "TRANSFORM_DEDUPLICATION"
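One common way to express the deduplication described above (keep the row with the highest `order_by_columns` value per key) is a window with `row_number`; this is an illustrative sketch, not the package's code:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "2024-01-01"), (1, "2024-01-02"), (2, "2024-01-01")],
    ["id", "source_file_modification_time"],
)

# Descending order keeps the highest value per key, matching the default behaviour.
window = Window.partitionBy("id").orderBy(F.col("source_file_modification_time").desc())
deduplicated = df.withColumn("_rn", F.row_number().over(window)).filter("_rn = 1").drop("_rn")
```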
@@ -10,14 +10,14 @@ class TransformDistinctAction(PipelineAction):
  If a subset is given these columns are used for duplicate comparison. If no subset is given all columns are used.

  Example:
- ```yaml
- Decode Columns:
- action: TRANSFORM_DISTINCT
- options:
- subset:
- - first_name
- - last_name
- ```
+ ```yaml
+ Distinct Columns:
+ action: TRANSFORM_DISTINCT
+ options:
+ subset:
+ - first_name
+ - last_name
+ ```
  """

  name: str = "TRANSFORM_DISTINCT"
@@ -8,12 +8,12 @@ class TransformFilterAction(PipelineAction):
  """Filters the DataFrame in the given context based on a specified condition.

  Example:
- ```yaml
- Decode Columns:
- action: TRANSFORM_FILTER
- options:
- condition: where city="Hamburg"
- ```
+ ```yaml
+ Filter Columns:
+ action: TRANSFORM_FILTER
+ options:
+ condition: city="Hamburg"
+ ```
  """

  name: str = "TRANSFORM_FILTER"
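The corrected example drops the leading `where`; the condition is a plain SQL expression, roughly equivalent to the following sketch (assumes an existing Spark session):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Hamburg",), ("Berlin",)], ["city"])

# The condition string is passed as a SQL expression, without a leading "where".
filtered = df.filter('city = "Hamburg"')
```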
@@ -13,12 +13,18 @@ class TransformSqlAction(PipelineAction):
  statement is executed on that view. The resulting DataFrame is returned.

  Example:
- ```yaml
- SQL Transform:
- action: TRANSFORM_SQL
- options:
- sql_statement: select city, revenue, firm from {DATA_FRAME} where product="Databricks"
- ```
+ ```yaml
+ SQL Transform:
+ action: TRANSFORM_SQL
+ options:
+ sql_statement: select city, revenue, firm from {DATA_FRAME} where product="Databricks"
+ ```
+ !!! note
+ The SQL statement should reference the DataFrame as "{DATA_FRAME}".
+ This nessy specific placeholder will be replaced with your input
+ DataFrame from the context. If your pipeline is defined as an
+ f-string, you can escape the curly braces by doubling them, e.g.,
+ "{{DATA_FRAME}}".
  """

  name: str = "TRANSFORM_SQL"
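A hypothetical sketch of what the `{DATA_FRAME}` placeholder boils down to: the input DataFrame is exposed as a temporary view whose name is substituted into the statement. The view name and the substitution mechanism below are assumptions, not the package's implementation:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Hamburg", 100, "acme", "Databricks")], ["city", "revenue", "firm", "product"]
)

sql_statement = 'select city, revenue, firm from {DATA_FRAME} where product="Databricks"'
df.createOrReplaceTempView("input_df")                           # hypothetical view name
result = spark.sql(sql_statement.format(DATA_FRAME="input_df"))  # placeholder substitution
```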
@@ -13,33 +13,27 @@ class TransformGroupAggregate(PipelineAction):
  to other columns. The aggregation functions can be specified as a dictionary where keys are column names
  and values are either a single aggregation function or a list of functions.

+ The output DataFrame will contain the grouped columns and the aggregated columns with the aggregation
+ function as a prefix to the column name.
+
  Example:
- ```yaml
- Transform Group Aggregate:
- action: TRANSFORM_GROUP_AGGREGATE
- options:
- grouping_columns:
- - column1
- - column2
- aggregations:
- column3:
- - sum
- - avg
- column4: max
- ```
-
- Attributes:
- name (str): The name of the action, default is "TRANSFORM_GROUP_AGGREGATE".
-
- Methods:
- run(context, grouping_columns=None, aggregations=None, **_):
- Executes the aggregation on the grouped data.
-
- Raises:
- ValueError: If the context data is None.
- ValueError: If no aggregations are provided.
- ValueError: If invalid aggregation operations are provided.
- ValueError: If columns with unsupported data types are included in the aggregations.
+ ```yaml
+ Transform Group Aggregate:
+ action: TRANSFORM_GROUP_AGGREGATE
+ options:
+ grouping_columns:
+ - column1
+ - column2
+ aggregations:
+ column3:
+ - sum
+ - avg
+ column4: max
+ ```
+
+ This example groups the DataFrame by `column1` and `column2` and aggregates `column3` by sum and average
+ and `column4` by max. The resulting DataFrame will contain the grouped columns `column1` and `column2`
+ and the aggregated columns `sum_column3`, `avg_column3`, and `max_column4`.
  """

  name: str = "TRANSFORM_GROUP_AGGREGATE"
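The prefix naming described above (`sum_column3`, `avg_column3`, `max_column4`) corresponds to a grouped aggregation like the following sketch (assumes an existing Spark session; not the package's own code):

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", "x", 1, 5), ("a", "x", 3, 2)],
    ["column1", "column2", "column3", "column4"],
)

# Aggregated columns carry the aggregation function as a prefix, as described above.
aggregated = df.groupBy("column1", "column2").agg(
    F.sum("column3").alias("sum_column3"),
    F.avg("column3").alias("avg_column3"),
    F.max("column4").alias("max_column4"),
)
```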
@@ -8,18 +8,25 @@ from ..pipeline_step import PipelineStep
  class TransformJoinAction(PipelineAction):
  """Joins the current DataFrame with another DataFrame defined in joined_data.

- The join operation is performed based on specified columns and the type of join
- indicated by the `how` parameter.
+ The join operation is performed based on specified columns and the type of
+ join indicated by the `how` parameter. Supported join types can be taken
+ from [PySpark
+ documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html)

  Example:
- ```yaml
- Join Tables:
- action: TRANSFORM_JOIN
- options:
- joined_data: ((step:Transform First Table))
- join_on: id
- how: anti
- ```
+ ```yaml
+ Join Tables:
+ action: TRANSFORM_JOIN
+ options:
+ joined_data: ((step:Transform First Table))
+ join_on: id
+ how: anti
+ ```
+
+ !!! note "Referencing a DataFrame from another step"
+ The `joined_data` parameter is a reference to the DataFrame from another step.
+ The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
+ for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
  """

  name: str = "TRANSFORM_JOIN"
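For reference, the `how: anti` example above maps onto a plain PySpark anti join, which keeps only left rows without a match; a minimal sketch:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
left = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
right = spark.createDataFrame([(1, "x")], ["id", "other"])

# Only the left row with id=2 survives the anti join.
result = left.join(right, on="id", how="anti")
```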
@@ -14,12 +14,25 @@ class TransformJsonNormalize(PipelineAction):
  structs are appended after existing columns.

  Example:
- ```yaml
- Normalize Tables:
- action: TRANSFORM_JSON_NORMALIZE
- options:
- exclude_columns: coordinates
- ```
+ ```yaml
+ Normalize Tables:
+ action: TRANSFORM_JSON_NORMALIZE
+ options:
+ exclude_columns: coordinates
+ ```
+ Example Input Data:
+
+ | id | name | coordinates | attributes |
+ |----|--------|----------------------|---------------------------|
+ | 1 | Alice | [10.0, 20.0] | {"age": 30, "city": "NY"} |
+ | 2 | Bob | [30.0, 40.0] | {"age": 25, "city": "LA"} |
+
+ Example Output Data:
+
+ | id | name | coordinates | attributes_age | attributes_city |
+ |----|--------|-------------|----------------|-----------------|
+ | 1 | Alice | [10.0, 20.0]| 30 | NY |
+ | 2 | Bob | [30.0, 40.0]| 25 | LA |
  """

  name: str = "TRANSFORM_JSON_NORMALIZE"
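One way to express the flattening shown in the input/output tables above is to select the struct fields with prefixed aliases; the action derives these fields automatically, so the sketch below is illustrative only:

```python
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [
        (1, "Alice", [10.0, 20.0], Row(age=30, city="NY")),
        (2, "Bob", [30.0, 40.0], Row(age=25, city="LA")),
    ],
    "id INT, name STRING, coordinates ARRAY<DOUBLE>, attributes STRUCT<age: INT, city: STRING>",
)

# coordinates is excluded from normalization; the attributes struct is expanded with a prefix.
flattened = df.select(
    "id",
    "name",
    "coordinates",
    F.col("attributes.age").alias("attributes_age"),
    F.col("attributes.city").alias("attributes_city"),
)
```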
@@ -12,13 +12,13 @@ class TransformRenameColumnsAction(PipelineAction):
  name and its corresponding value represents the new column name.

  Example:
- ```yaml
- Rename Column:
- action: TRANSFORM_RENAME_COLUMNS
- options:
- columns:
- a_very_long_column_name: shortname
- ```
+ ```yaml
+ Rename Column:
+ action: TRANSFORM_RENAME_COLUMNS
+ options:
+ columns:
+ a_very_long_column_name: shortname
+ ```
  """

  name: str = "TRANSFORM_RENAME_COLUMNS"
@@ -13,14 +13,14 @@ class TransformReplaceValuesAction(PipelineAction):
  in the specified columns.

  Example:
- ```yaml
- Replace Values:
- action: TRANSFORM_REPLACE_VALUES
- options:
- replace:
- empl_function:
- sales_employee: seller
- ```
+ ```yaml
+ Replace Values:
+ action: TRANSFORM_REPLACE_VALUES
+ options:
+ replace:
+ empl_function:
+ sales_employee: seller
+ ```
  """

  name: str = "TRANSFORM_REPLACE_VALUES"
@@ -14,15 +14,44 @@ class TransformSelectColumnsAction(PipelineAction):
  DataFrame before performing the selection.

  Example:
- ```yaml
- Select Columns:
- action: TRANSFORM_SELECT_COLUMNS
- options:
- include_columns:
- - id
- - city
- - product
- ```
+ Example Input Data:
+
+ | id | name | coordinates | attributes |
+ |----|--------|----------------------|---------------------------|
+ | 1 | Alice | [10.0, 20.0] | {"age": 30, "city": "NY"} |
+ | 2 | Bob | [30.0, 40.0] | {"age": 25, "city": "LA"} |
+ === "Include Columns"
+ ```yaml
+ Select Columns:
+ action: TRANSFORM_SELECT_COLUMNS
+ options:
+ include_columns:
+ - id
+ - name
+ - coordinates
+ ```
+ Example Output Data:
+
+ | id | name | coordinates |
+ |----|--------|----------------------|
+ | 1 | Alice | [10.0, 20.0] |
+ | 2 | Bob | [30.0, 40.0] |
+
+ === "Exclude Columns"
+ ```yaml
+ Select Columns:
+ action: TRANSFORM_SELECT_COLUMNS
+ options:
+ exclude_columns:
+ - coordinates
+ ```
+ Example Output Data:
+
+ | id | name | attributes |
+ |----|--------|---------------------------|
+ | 1 | Alice | {"age": 30, "city": "NY"} |
+ | 2 | Bob | {"age": 25, "city": "LA"} |
+
  """

  name: str = "TRANSFORM_SELECT_COLUMNS"
@@ -17,14 +17,18 @@ class TransformUnionAction(PipelineAction):
  empty, a ValueError will be raised.

  Example:
- ```yaml
- Union Tables:
- action: TRANSFORM_UNION
- options:
- union_data:
- - ((step: Filter First Table))
- - ((step: SQL Transform Second Table))
- ```
+ ```yaml
+ Union Tables:
+ action: TRANSFORM_UNION
+ options:
+ union_data:
+ - ((step: Filter First Table))
+ - ((step: SQL Transform Second Table))
+ ```
+ !!! note "Referencing a DataFrame from another step"
+ The `union_data` parameter is a reference to the DataFrame from another step.
+ The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
+ for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
  """

  name: str = "TRANSFORM_UNION"
@@ -9,15 +9,16 @@ class WriteCatalogTableAction(PipelineAction):
  """Writes a DataFrame to a specified catalog table using [CatalogWriter][cloe_nessy.integration.writer.CatalogWriter].

  Example:
- ```yaml
- Write Table to Catalog:
- action: WRITE_CATALOG_TABLE
- options:
- table_identifier: my_catalog.business_schema.sales_table
- mode: append
- partition_by: day
- options: <options for the writer>
- ```
+ ```yaml
+ Write Table to Catalog:
+ action: WRITE_CATALOG_TABLE
+ options:
+ table_identifier: my_catalog.business_schema.sales_table
+ mode: append
+ partition_by: day
+ options:
+ mergeSchema: true
+ ```
  """

  name: str = "WRITE_CATALOG_TABLE"
@@ -42,7 +43,7 @@ class WriteCatalogTableAction(PipelineAction):
  mode: The write mode. One of 'append', 'overwrite', 'error',
  'errorifexists', or 'ignore'.
  partition_by: Names of the partitioning columns.
- options: Additional options for the write operation.
+ options: PySpark options for the DataFrame.saveAsTable operation (e.g. mergeSchema:true).

  Raises:
  ValueError: If the table name is not specified or cannot be inferred from
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: cloe-nessy
- Version: 0.3.3
+ Version: 0.3.5
  Summary: Your friendly datalake monster.
  Home-page: https://initions.com/
  Author: initions
@@ -22,10 +22,10 @@ cloe_nessy/integration/reader/api_reader.py,sha256=j3Z5O1oH-Zc43TyA_aYtnDNYC9xFM
  cloe_nessy/integration/reader/catalog_reader.py,sha256=tGK-Y0jZQGOrF9eZUzSr7ils-L58uex6qH9PZ81ZLy8,1835
  cloe_nessy/integration/reader/excel_reader.py,sha256=4kifpIakHpGmap0-P0SUgjJoQdY-eeiZBIDrQp87wK8,8012
  cloe_nessy/integration/reader/exceptions.py,sha256=_A9jFpe_RIDZCGY76qzjic9bsshxns6yXPSl141dq1c,203
- cloe_nessy/integration/reader/file_reader.py,sha256=pkrW_N5avqQpqcZuIQgHw5CFf7DFpSuKvq88zPZPfyY,3879
+ cloe_nessy/integration/reader/file_reader.py,sha256=1os8pZIXAGTJBZjGREmHOTlZeabbikC7sDv5xn3bIjE,3950
  cloe_nessy/integration/reader/reader.py,sha256=e2KVPePQme8SBQJEbL-3zpGasOgTiEvKFTslow2wGPw,1034
  cloe_nessy/integration/writer/__init__.py,sha256=NIh0t1RYlG3J1Y5_CvnR36N9tISmcElD5Tq06ksmqoA,71
- cloe_nessy/integration/writer/catalog_writer.py,sha256=49lDvYttUY79Ye_OMN2cji7lGJNNML4TTsjY7VvLVfc,2137
+ cloe_nessy/integration/writer/catalog_writer.py,sha256=Gb-hMdADgO_uUJ7mZPHBYyNme2qXsdFFnzwo7GcShHM,2192
  cloe_nessy/logging/__init__.py,sha256=ySVCVbdyR3Dno_tl2ZfiER_7EVaDoQMHVkNyfdMZumY,65
  cloe_nessy/logging/logger_mixin.py,sha256=9iy7BF6drYme-f7Rrt_imbVBRgVqQ89xjcP1X5aMtfY,7467
  cloe_nessy/models/__init__.py,sha256=_JPN_R5-QDfjYzvrvZDdeOezl0C-JTG-Rk4S1VE5vJM,242
@@ -48,34 +48,34 @@ cloe_nessy/pipeline/pipeline_context.py,sha256=csElDc6BsynDUtRXgQOSCH7ONc_b-ag0Y
  cloe_nessy/pipeline/pipeline_parsing_service.py,sha256=c_nAsgw81QYBM9AFiTxGgqRhNXABkDKplbeoCJPtbpE,6434
  cloe_nessy/pipeline/pipeline_step.py,sha256=UlnmpS6gm_dZ7m9dD1mZvye7mvUF_DA7HjOZo0oGYDU,1977
  cloe_nessy/pipeline/actions/__init__.py,sha256=LwKctXy4Jun52BnCVGvWa8nnKVjTSov4GT58j6Zy8zg,2273
- cloe_nessy/pipeline/actions/read_api.py,sha256=wGyPZdeh3Cam_BQBilltWBWCIdD9I_kv4lunEhE39Tg,6625
- cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=aZy4sJLLE8ZQ_SPXGSDoHYaBJTz8s7xQDVn5eYrYHvE,2689
- cloe_nessy/pipeline/actions/read_excel.py,sha256=EgHbK1wO6dkDo0KErYDhK_2sNIkIoa-6As9oo9dNFsE,7708
- cloe_nessy/pipeline/actions/read_files.py,sha256=8twjprqKYEmVu5QITEGe4no45TfhgzZosTFVQ89vV6g,3861
- cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=aZtkstf9jBYYN2MGnazz63BG_hJ7mIgAfKiNqUpc26E,2235
- cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=Nz3Ncr-Zd-wy8g9-aN5XcvpWAHLyWs70RpZ7KqKqIaU,1788
- cloe_nessy/pipeline/actions/transform_clean_column_names.py,sha256=XuVAVEbp-UiF8PO6wAEJyl1TYgBD7MSnuOGhuEvXKv4,2881
- cloe_nessy/pipeline/actions/transform_concat_columns.py,sha256=V0TzeQFpBYur_T1Nv0nRpOU02nKQ2iypo2CCcV2rBtk,3083
- cloe_nessy/pipeline/actions/transform_decode.py,sha256=DmT-29dIqbz_xTj4GSCfnbgYRCiUrWzKvGrRYy1frNw,4004
- cloe_nessy/pipeline/actions/transform_deduplication.py,sha256=2VN5_wza7sD7fERyG6ElGh_Yo-W-Mxw-QBmtDXs1MGQ,5063
- cloe_nessy/pipeline/actions/transform_distinct.py,sha256=R0Wv_YnWOw198r0rPR_72fgH5sp7upgjZzfOPTZ1oPA,1942
- cloe_nessy/pipeline/actions/transform_filter.py,sha256=vOAxKtNWCABLb6G6Xz98NK7fEfgn6QJia31S7IvoUTg,1428
- cloe_nessy/pipeline/actions/transform_generic_sql.py,sha256=cli59HCERFge7f0RB8yXw2oDtHSbMCWQMdeCeqhbdg8,2355
- cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=HcY4sqb2yNBCz90jQtxGA8fZPuQXfJuaDmv8lWuoTqg,4050
- cloe_nessy/pipeline/actions/transform_join.py,sha256=qktyaN2kcCkmoH3RILTc-UGYsGACx1nXH6xLtuvYi7k,3080
- cloe_nessy/pipeline/actions/transform_json_normalize.py,sha256=xN_cQgHSMSyPsyYXBdoe2i5pHnyH-kkH5do8qr3vybw,4157
- cloe_nessy/pipeline/actions/transform_rename_columns.py,sha256=fFdg3353QCE3zBei6iYQW9huPBcQ906sJLioaOUWj3s,1924
- cloe_nessy/pipeline/actions/transform_replace_values.py,sha256=-uOAbHkQZ2X23GB15W4-miAoHzyFH9hJyc6Y_5PA0w8,2017
- cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=Kez8puDK7cRfhleBEX-B-elKCvNPRU9ERSWs9afMGO8,3369
- cloe_nessy/pipeline/actions/transform_union.py,sha256=TDER06IABzxvIez4bGLKCLaDA4eScpTzYRbfUzwv_RQ,2342
- cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=6yAHTX5kZviumgBW_NYVGAUin6U2nDzmic9of6wA8FY,2590
+ cloe_nessy/pipeline/actions/read_api.py,sha256=RBv5XeHtjTXuCP09Fqo6JNx6iIhQQI-nuAHCuSaGs2s,7778
+ cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=-k2wezkv8bE_xwoW7WM1ORhrCXQagKTUuXkhI2ZEROs,2783
+ cloe_nessy/pipeline/actions/read_excel.py,sha256=Mhl3r_2Hqk2XN7Fl5WqqAyE4JdnwSiivbhWMglyBtkE,7961
+ cloe_nessy/pipeline/actions/read_files.py,sha256=N9bFgtG1tovhp2JayxE5YiN9PiO2lgG2-6h_Y6tD2eU,5220
+ cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=3ZDy9qiDYtM1oDQzHPC23hLOvHjhdk5zg1wVHE60m9k,2295
+ cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=24Tn6R3TvUkWCh8V6naLdyNbCbqvyPOOoer-hy_Ebq4,2077
+ cloe_nessy/pipeline/actions/transform_clean_column_names.py,sha256=-CEdcXb7Fz5DQNitGlJ8EVBE_LzxfsInyCIO-D7b4iY,3042
+ cloe_nessy/pipeline/actions/transform_concat_columns.py,sha256=Nk8YbhxDnFZsWzW9Dj5Yl76Uq6VrcMlevQPHGms65L8,3777
+ cloe_nessy/pipeline/actions/transform_decode.py,sha256=JajMwHREtxa8u_1Q3RZDBVMjncoSel-WzQFVTO0MREg,4455
+ cloe_nessy/pipeline/actions/transform_deduplication.py,sha256=E0ypz9qkHMSatNfnHekP-E6svQVL149M4PV02M03drg,5099
+ cloe_nessy/pipeline/actions/transform_distinct.py,sha256=c7aBxANyqT4aKhm0cSELDtD-bP0Se9vxlBF0K4AgQWs,1976
+ cloe_nessy/pipeline/actions/transform_filter.py,sha256=Nz_ggRfKIcNzYFfFOsgq1QeatjdEis0up4I7cOWBdyo,1446
+ cloe_nessy/pipeline/actions/transform_generic_sql.py,sha256=_naWfmPdYAUKjPNeHu5qJAohOL7DHCSYz_kwoeRv3OI,2741
+ cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=KUHeeP-RIDi34dpbsPEJkzea5zFJA6MuyjNpOsFud9o,4045
+ cloe_nessy/pipeline/actions/transform_join.py,sha256=e_tvMk8YJTAWcUK_EmOgNt0s31ICZoMX_MKOTWx4lBY,3645
+ cloe_nessy/pipeline/actions/transform_json_normalize.py,sha256=petF7pnNq1EKc8MqVdG0weFALAHNILSe_eAu4Z5XxIo,4833
+ cloe_nessy/pipeline/actions/transform_rename_columns.py,sha256=4zJcPCONMU4C67qeuzsrX3AORRRHoq_selUI7FJyeg0,1952
+ cloe_nessy/pipeline/actions/transform_replace_values.py,sha256=1OPHTrjcphfyGepcO7ozYfeqfwA18pjlyHpVKUS_AAU,2049
+ cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=-GhSEsb7iNnZIsYRm3BG9BX4_qUDJMbpj1DsKPY046w,4574
+ cloe_nessy/pipeline/actions/transform_union.py,sha256=s81Vge0AbYPc7VkskCYfOQ_LEjqcmfNFyDkytfjcZyo,2720
+ cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=j7gRuG3Fedh8JgevIFBbHKock3laJVq4l6Mx3CGU5eo,2676
  cloe_nessy/session/__init__.py,sha256=t7_YjUhJYW3km_FrucaUdbIl1boQtwkyhw_8yE10qzc,74
  cloe_nessy/session/session_manager.py,sha256=PK7awMc6fmot7f9FMmvIUbIzKFgjcy2o2bZS9kjVs10,6733
  cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEvzM,101
  cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
  cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
- cloe_nessy-0.3.3.dist-info/METADATA,sha256=StCfl2I5dItaMzO10u3gQw6WxfjZUZHRIodEvKuQu_s,1837
- cloe_nessy-0.3.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
- cloe_nessy-0.3.3.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
- cloe_nessy-0.3.3.dist-info/RECORD,,
+ cloe_nessy-0.3.5.dist-info/METADATA,sha256=UUx3aIUgvCLn7j3H4DbCL1k9-47HPKaANiMQsUj66wo,1837
+ cloe_nessy-0.3.5.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+ cloe_nessy-0.3.5.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
+ cloe_nessy-0.3.5.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.8.2)
+ Generator: setuptools (76.0.0)
  Root-Is-Purelib: true
  Tag: py3-none-any