cloe-nessy 0.3.2__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/integration/reader/file_reader.py +7 -1
- cloe_nessy/integration/writer/catalog_writer.py +1 -1
- cloe_nessy/pipeline/actions/__init__.py +2 -0
- cloe_nessy/pipeline/actions/read_api.py +69 -45
- cloe_nessy/pipeline/actions/read_catalog_table.py +9 -9
- cloe_nessy/pipeline/actions/read_excel.py +14 -10
- cloe_nessy/pipeline/actions/read_files.py +54 -28
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +9 -9
- cloe_nessy/pipeline/actions/transform_change_datatype.py +13 -8
- cloe_nessy/pipeline/actions/transform_clean_column_names.py +81 -0
- cloe_nessy/pipeline/actions/transform_concat_columns.py +25 -11
- cloe_nessy/pipeline/actions/transform_decode.py +18 -7
- cloe_nessy/pipeline/actions/transform_deduplication.py +9 -9
- cloe_nessy/pipeline/actions/transform_distinct.py +22 -5
- cloe_nessy/pipeline/actions/transform_filter.py +6 -6
- cloe_nessy/pipeline/actions/transform_generic_sql.py +12 -6
- cloe_nessy/pipeline/actions/transform_group_aggregate.py +20 -26
- cloe_nessy/pipeline/actions/transform_join.py +17 -10
- cloe_nessy/pipeline/actions/transform_json_normalize.py +19 -6
- cloe_nessy/pipeline/actions/transform_rename_columns.py +7 -7
- cloe_nessy/pipeline/actions/transform_replace_values.py +8 -8
- cloe_nessy/pipeline/actions/transform_select_columns.py +38 -9
- cloe_nessy/pipeline/actions/transform_union.py +12 -8
- cloe_nessy/pipeline/actions/write_catalog_table.py +11 -10
- cloe_nessy/session/session_manager.py +13 -7
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.5.dist-info}/METADATA +1 -1
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.5.dist-info}/RECORD +29 -28
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.5.dist-info}/WHEEL +1 -1
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.5.dist-info}/top_level.txt +0 -0
cloe_nessy/integration/reader/file_reader.py

@@ -46,7 +46,13 @@ class FileReader(BaseReader):
         if not spark_format and not extension:
             raise ValueError("Either spark_format or extension must be provided.")
         self._console_logger.debug(f"Reading files from [ '{location}' ] ...")
-        extension_to_datatype_dict = {
+        extension_to_datatype_dict = {
+            "csv": "csv",
+            "json": "json",
+            "parquet": "parquet",
+            "txt": "text",
+            "xml": "xml",
+        }

         if extension and not spark_format:
             if extension not in extension_to_datatype_dict:
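The new extension-to-format mapping is what lets the reader derive a Spark format from a file extension when only `extension` is given. A minimal, illustrative sketch of that lookup (the helper name is hypothetical; this is not the package's implementation):

```python
extension_to_datatype_dict = {
    "csv": "csv",
    "json": "json",
    "parquet": "parquet",
    "txt": "text",
    "xml": "xml",
}

def resolve_spark_format(extension: str | None, spark_format: str | None) -> str:
    """Prefer an explicit spark_format, otherwise derive it from the extension."""
    if spark_format:
        return spark_format
    if not extension:
        raise ValueError("Either spark_format or extension must be provided.")
    try:
        return extension_to_datatype_dict[extension.lower()]
    except KeyError as err:
        raise ValueError(f"Unsupported extension: {extension!r}") from err

print(resolve_spark_format("txt", None))  # -> "text"
```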
cloe_nessy/integration/writer/catalog_writer.py

@@ -20,7 +20,7 @@ class CatalogWriter:
                 format 'catalog.schema.table'.
             mode: The write mode. One of append, overwrite, error, errorifexists, ignore.
             partition_by: Names of the partitioning columns.
-            options:
+            options: PySpark options for the DataFrame.saveAsTable operation (e.g. mergeSchema:true).

         Notes:
             append: Append contents of this DataFrame to existing data.
cloe_nessy/pipeline/actions/__init__.py

@@ -7,6 +7,7 @@ from .read_excel import ReadExcelAction
 from .read_files import ReadFilesAction
 from .read_metadata_yaml import ReadMetadataYAMLAction
 from .transform_change_datatype import TransformChangeDatatypeAction
+from .transform_clean_column_names import TransformCleanColumnNamesAction
 from .transform_concat_columns import TransformConcatColumnsAction
 from .transform_decode import TransformDecodeAction
 from .transform_distinct import TransformDistinctAction
@@ -39,6 +40,7 @@ __all__ = [
     "TransformFilterAction",
     "TransformUnionAction",
     "TransformChangeDatatypeAction",
+    "TransformCleanColumnNamesAction",
     "TransformConcatColumnsAction",
     "TransformDecodeAction",
     "TransformDistinctAction",
cloe_nessy/pipeline/actions/read_api.py

@@ -55,51 +55,75 @@ class ReadAPIAction(PipelineAction):
         DataFrame containing the response data.

     Example:
-        ... (45 removed docstring lines not rendered in this diff view)
+        === "Basic Usage"
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    endpoint: my/endpoint/
+            ```
+        === "Usage with Parameters and Headers"
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    endpoint: my/endpoint/
+                    method: GET
+                    timeout: 90
+                    headers:
+                        key1: value1
+                        key2: value2
+                    params:
+                        key1: value1
+                        key2: value2
+            ```
+        === "Usage with Authentication"
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    endpoint: my/endpoint/
+                    method: GET
+                    timeout: 90
+                    auth:
+                        - type: basic
+                          username: my_username
+                          password: my_password
+                        - type: secret_scope
+                          secret_scope: my_secret_scope
+                          header_template:
+                              "header_key_1": "<ENVIRONMENT_VARIABLE_NAME>"
+                        - type: secret_scope
+                          secret_scope: my_secret_scope
+                          header_template:
+                              "header_key_2": "<SECRET_NAME>"
+                        - type: secret_scope
+                          secret_scope: my_other_secret_scope
+                          header_template:
+                              "header_key_3": "<SECRET_NAME>"
+                        - type: azure_oauth
+                          client_id: my_client_id
+                          client_secret: my_client_secret
+                          tenant_id: my_tenant_id
+                          scope: <entra-id-client-id>
+            ```
+
+        The above example will combine the headers from the different auth types. The resulting header will look like this:
+
+        ```json
+        {
+            "header_key_1": "value_from_environment_variable",
+            "header_key_2": "value_from_secret",
+            "header_key_3": "value_from_secret",
+            "Authorization": "Bearer <access_token> (from azure_oauth)",
+            "Authorization": "Basic am9obkBleGFtcGxlLmNvbTphYmMxMjM= (from basic)"
+        }
+        ```
+
+        !!! warning "Secret information"
             Don't write sensitive information like passwords or tokens directly in the pipeline configuration.
             Use secret scopes or environment variables instead.
     """
cloe_nessy/pipeline/actions/read_catalog_table.py

@@ -15,13 +15,13 @@ class ReadCatalogTableAction(PipelineAction):
     into a DataFrame and returned as part of an updated `PipelineContext`.

     Example:
-        ... (7 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Read Sales Table:
+            action: READ_CATALOG_TABLE
+            options:
+                table_identifier: my_catalog.business_schema.sales_table
+                options: <options for the CatalogReader read method>
+        ```
     """

     name: str = "READ_CATALOG_TABLE"
@@ -43,8 +43,8 @@ class ReadCatalogTableAction(PipelineAction):
                 read. If not provided, the function will attempt to use the table
                 identifier from the `table_metadata` in the `context`.
             options: A dictionary of options for customizing
-                the
-                to None.
+                the [`CatalogReader`][cloe_nessy.integration.reader.catalog_reader]
+                behavior, such as filters or reading modes. Defaults to None.

         Raises:
             ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
cloe_nessy/pipeline/actions/read_excel.py

@@ -21,16 +21,20 @@ class ReadExcelAction(PipelineAction):
     the read files can be included in the context.

     Example:
-        ... (10 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Read Excel Table:
+            action: READ_EXCEL
+            options:
+                file: excel_file_folder/excel_files_june/interesting_excel_file.xlsx
+                usecols:
+                    - key_column
+                    - interesting_column
+                options: <options for the ExcelDataFrameReader read method>
+        ```
+
+        !!! note "More Options"
+            The `READ_EXCEL` action supports additional options that can be passed to the
+            run method. For more information, refer to the method documentation.
     """

     name: str = "READ_EXCEL"
cloe_nessy/pipeline/actions/read_files.py

@@ -14,14 +14,47 @@ class ReadFilesAction(PipelineAction):
     location will be read using a DataFrameReader with the specified format.

     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        === "Read files specified by spark_format"
+            ```yaml
+            Read Files:
+                action: READ_FILES
+                options:
+                    location: json_file_folder/
+                    search_subdirs: True
+                    spark_format: JSON
+            ```
+            !!! note "Define Spark Format"
+                Use the `spark_format` option to specify the format with which
+                to read the files. Supported formats are e.g., `CSV`, `JSON`,
+                `PARQUET`, `TEXT`, and `XML`.
+
+        === "Read files specified by extension"
+            ```yaml
+            Read Files:
+                action: READ_FILES
+                options:
+                    location: csv_file_folder/
+                    search_subdirs: True
+                    extension: csv
+            ```
+            !!! note "Define Extension"
+                Use the `extension` option to specify the extension of the files
+                to read. If not specified, the `spark_format` will be derived from
+                the extension.
+
+        === "Read files with a specified spark_format AND extension"
+            ```yaml
+            Read Files:
+                action: READ_FILES
+                options:
+                    location: file_folder/
+                    extension: abc_custom_extension # specifies the files to read
+                    spark_format: CSV # specifies the format to read the files with
+            ```
+            !!! note "Define both Extension & Spark Format"
+                Use the `extension` option to specify the extension of the files
+                to read. Additionally, use the `spark_format` option to specify
+                the format with which to read the files.
     """

     name: str = "READ_FILES"
@@ -47,7 +80,8 @@ class ReadFilesAction(PipelineAction):
             search_subdirs: Recursively search subdirectories for files
                 if an extension is provided.
             extension: The file extension to filter files by.
-            spark_format: The format to use for reading the files.
+            spark_format: The format to use for reading the files. If not provided,
+                it will be deferred from the file extension.
             schema: The schema of the data. If None, schema is obtained from
                 the context metadata.
             add_metadata_column: Whether to include the `__metadata` column with
@@ -65,30 +99,22 @@ class ReadFilesAction(PipelineAction):
             raise ValueError("No location provided. Please specify location to read files from.")
         if not options:
             options = dict()
+        if not spark_format and not extension:
+            raise ValueError("Either spark_format or extension must be provided.")

         if (metadata := context.table_metadata) and schema is None:
             schema = metadata.schema

         file_reader = FileReader()
-        ... (9 removed lines not rendered in this diff view)
-        elif spark_format:
-            df = file_reader.read(
-                location=location,
-                schema=schema,
-                spark_format=spark_format,
-                options=options,
-                add_metadata_column=add_metadata_column,
-            )
-        else:
-            raise ValueError("Please provide either the 'extension' or 'spark_format'")
+        df = file_reader.read(
+            location=location,
+            schema=schema,
+            extension=extension,
+            spark_format=spark_format,
+            search_subdirs=search_subdirs,
+            options=options,
+            add_metadata_column=add_metadata_column,
+        )

         runtime_info = context.runtime_info

cloe_nessy/pipeline/actions/read_metadata_yaml.py

@@ -10,14 +10,14 @@ class ReadMetadataYAMLAction(PipelineAction):
     """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.

     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Read Schema Metadata:
+            action: READ_METADATA_YAML_ACTION
+            options:
+                path: excel_file_folder/excel_files_june/
+                file_name: sales_schema.yml
+                table_name: sales
+        ```
     """

     name: str = "READ_METADATA_YAML_ACTION"
@@ -31,7 +31,7 @@ class ReadMetadataYAMLAction(PipelineAction):
         table_name: str | None = None,
         **_: Any,
     ) -> PipelineContext:
-        """Reads schema metadata from a yaml file using the `Schema` model.
+        """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.

         Args:
             context: The context in which this Action is executed.
cloe_nessy/pipeline/actions/transform_change_datatype.py

@@ -9,15 +9,20 @@ from ..pipeline_context import PipelineContext
 class TransformChangeDatatypeAction(PipelineAction):
     """Changes the datatypes of specified columns in the given DataFrame.

+    !!! note "Data Types"
+        We make use of the PySpark `cast` function to change the data types of
+        the columns. Valid data types can be found in the [PySpark
+        documentation](https://spark.apache.org/docs/3.5.3/sql-ref-datatypes.html).
+
     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Cast Columns:
+            action: TRANSFORM_CHANGE_DATATYPE
+            options:
+                columns:
+                    id: string
+                    revenue: long
+        ```
     """

     name: str = "TRANSFORM_CHANGE_DATATYPE"
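The note above points at PySpark's `cast`, and the YAML `columns` mapping translates directly into per-column casts. A minimal sketch of that equivalence (column names match the example; this is not the action's source):

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 19.99)], ["id", "revenue"])

# Mirrors the YAML above: {"id": "string", "revenue": "long"} (hypothetical columns).
columns = {"id": "string", "revenue": "long"}
for column, dtype in columns.items():
    df = df.withColumn(column, F.col(column).cast(dtype))

df.printSchema()
```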
cloe_nessy/pipeline/actions/transform_clean_column_names.py (new file)

@@ -0,0 +1,81 @@
+import json
+import re
+from typing import Any
+
+import pyspark.sql.functions as F
+import pyspark.sql.types as T
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformCleanColumnNamesAction(PipelineAction):
+    """Fixes column names in the DataFrame to be valid.
+
+    Removes invalid characters from the column names, including the fields of a struct and
+    replaces a single leading underscore by a double underscore.
+
+    Invalid characters include:
+        - Any non-word character (anything other than letters, digits, and underscores).
+        - A single leading underscore.
+
+    Example:
+        ```yaml
+        Clean Column Names:
+            action: TRANSFORM_CLEAN_COLUMN_NAMES
+        ```
+    """
+
+    name: str = "TRANSFORM_CLEAN_COLUMN_NAMES"
+
+    def run(
+        self,
+        context: PipelineContext,
+        **_: Any,
+    ) -> PipelineContext:
+        """Fixes column names in the DataFrame to be valid.
+
+        Removes invalid characters from the column names, including the fields of a struct and
+        replaces a single leading underscore by a double underscore.
+
+        Args:
+            context: The context in which this Action is executed.
+
+        Raises:
+            ValueError: If the data from the context is None.
+
+        Returns:
+            The context after the execution of this Action, containing the DataFrame with cleaned column names.
+        """
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        with_columns_renamed = {}
+        with_columns_casted: dict[str, T.StructType | T.ArrayType | T.MapType] = {}
+
+        single_underscrore_at_beginning = r"^_(?=[^_])"
+
+        for c in context.data.schema:
+            old_name = c.name
+            new_name = re.sub(single_underscrore_at_beginning, "__", re.sub("\W", "_", old_name))
+            with_columns_renamed[old_name] = new_name
+
+            if isinstance(c.dataType, (T.StructType | T.ArrayType | T.MapType)):
+                old_column_schema = c.dataType.json()
+                new_column_schema = re.sub(
+                    r'(?<="name":")[^"]+',
+                    lambda m: re.sub("\W", "_", str(m.group())),
+                    old_column_schema,
+                )
+                if isinstance(c.dataType, T.StructType):
+                    with_columns_casted[new_name] = T.StructType.fromJson(json.loads(new_column_schema))
+                elif isinstance(c.dataType, T.ArrayType):
+                    with_columns_casted[new_name] = T.ArrayType.fromJson(json.loads(new_column_schema))
+                elif isinstance(c.dataType, T.MapType):
+                    with_columns_casted[new_name] = T.MapType.fromJson(json.loads(new_column_schema))
+
+        df = context.data.withColumnsRenamed(with_columns_renamed)
+        for c_name, c_type in with_columns_casted.items():
+            df = df.withColumn(c_name, F.col(c_name).cast(c_type))
+
+        return context.from_existing(data=df)  # type: ignore
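To make the renaming rules concrete, here is a tiny standalone sketch of the same two regular expressions applied to a few sample column names (illustrative helper only, not the package API):

```python
import re

def clean_name(name: str) -> str:
    # Replace non-word characters with "_", then double a single leading underscore,
    # mirroring the rules described in the new action above.
    cleaned = re.sub(r"\W", "_", name)
    return re.sub(r"^_(?=[^_])", "__", cleaned)

for raw in ["order id", "price (EUR)", "_internal", "__already_ok"]:
    print(f"{raw!r} -> {clean_name(raw)!r}")
# 'order id'     -> 'order_id'
# 'price (EUR)'  -> 'price__EUR_'   (each non-word character becomes "_")
# '_internal'    -> '__internal'
# '__already_ok' -> '__already_ok'
```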
cloe_nessy/pipeline/actions/transform_concat_columns.py

@@ -10,17 +10,31 @@ class TransformConcatColumnsAction(PipelineAction):
     """Concatenates the specified columns in the given DataFrame.

     Example:
-        ... (11 removed docstring lines not rendered in this diff view)
+        === "concat with separator"
+            ```yaml
+            Concat Columns:
+                action: TRANSFORM_CONCAT_COLUMNS
+                options:
+                    name: address
+                    columns:
+                        - street
+                        - postcode
+                        - country
+                    separator: ', '
+            ```
+        === "concat without separator"
+            ```yaml
+            Concat Column:
+                action: TRANSFORM_CONCAT_COLUMNS
+                options:
+                    name: address
+                    columns:
+                        - street
+                        - postcode
+                        - country
+            ```
+            !!! warning "beware of null handling"
+                The `separator` option is not provided, so the default behavior is to use `concat` which returns `NULL` if any of the concatenated values is `NULL`.
     """

     name: str = "TRANSFORM_CONCAT_COLUMNS"
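The null-handling warning mirrors the difference between PySpark's `concat` and `concat_ws`. A quick sketch of the two behaviors (column names taken from the example, data invented; not the action's internals):

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Main St", "12345", None)],
    "street string, postcode string, country string",
)

# concat: any NULL input makes the whole result NULL.
# concat_ws: NULL inputs are skipped, so a separator-based concat still yields a value.
df.select(
    F.concat("street", "postcode", "country").alias("concat"),
    F.concat_ws(", ", "street", "postcode", "country").alias("concat_ws"),
).show(truncate=False)
# concat -> NULL, concat_ws -> "Main St, 12345"
```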
cloe_nessy/pipeline/actions/transform_decode.py

@@ -11,13 +11,24 @@ class TransformDecodeAction(PipelineAction):
     """Decodes values of a specified column in the DataFrame based on the given format.

     Example:
-        ... (7 removed docstring lines not rendered in this diff view)
+        === "Decode JSON column"
+            ```yaml
+            Expand JSON:
+                action: "TRANSFORM_DECODE"
+                options:
+                    column: "data"
+                    input_format: "json"
+                    schema: "quality INT, timestamp TIMESTAMP, value DOUBLE"
+            ```
+        === "Decode base64 column"
+            ```yaml
+            Decode base64:
+                action: TRANSFORM_DECODE
+                options:
+                    column: encoded_data
+                    input_format: base64
+                    schema: string
+            ```
     """

     name: str = "TRANSFORM_DECODE"
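For orientation, the two `input_format` variants correspond roughly to PySpark's `from_json` (against a DDL schema) and `unbase64`. A hedged sketch with made-up data, not the action's implementation:

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [('{"quality": 1, "value": 2.5}', "aGVsbG8=")],
    "data string, encoded_data string",
)

decoded = df.select(
    F.from_json("data", "quality INT, value DOUBLE").alias("data"),   # JSON decode with a DDL schema
    F.unbase64("encoded_data").cast("string").alias("decoded_text"),  # base64 decode to a string
)
decoded.show(truncate=False)  # decoded_text -> "hello"
```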
cloe_nessy/pipeline/actions/transform_deduplication.py

@@ -18,15 +18,15 @@ class TransformDeduplication(PipelineAction):
     (can be changed to lowest by setting the parameter descending to false).

     Example:
-        ... (9 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Deduplicate Columns:
+            action: TRANSFORM_DEDUPLICATION
+            options:
+                key_columns:
+                    - id
+                order_by_columns:
+                    - source_file_modification_time
+        ```
     """

     name: str = "TRANSFORM_DEDUPLICATION"
cloe_nessy/pipeline/actions/transform_distinct.py

@@ -7,11 +7,17 @@ from ..pipeline_context import PipelineContext
 class TransformDistinctAction(PipelineAction):
     """Selects distinct rows from the DataFrame in the given context.

+    If a subset is given these columns are used for duplicate comparison. If no subset is given all columns are used.
+
     Example:
-        ... (4 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Distinct Columns:
+            action: TRANSFORM_DISTINCT
+            options:
+                subset:
+                    - first_name
+                    - last_name
+        ```
     """

     name: str = "TRANSFORM_DISTINCT"
@@ -19,12 +25,15 @@ class TransformDistinctAction(PipelineAction):
     def run(
         self,
         context: PipelineContext,
+        *,
+        subset: list[str] | None = None,
         **_: Any,
     ) -> PipelineContext:
         """Selects distinct rows from the DataFrame in the given context.

         Args:
             context: The context in which this Action is executed.
+            subset: List of column names to use for duplicate comparison (default All columns).

         Raises:
             ValueError: If the data from the context is None.
@@ -35,6 +44,14 @@ class TransformDistinctAction(PipelineAction):
         if context.data is None:
             raise ValueError("Data from the context is required for the operation.")

-        ... (1 removed line not rendered in this diff view)
+        # check if all columns that are part of the subset are actually part of the dataframe.
+        if subset is not None:
+            subset_columns_not_in_dataframe = set(subset) - set(context.data.columns)
+            if len(subset_columns_not_in_dataframe) != 0:
+                raise ValueError(
+                    f"The following subset columns are not part of the dataframe: {subset_columns_not_in_dataframe}"
+                )
+
+        df = context.data.dropDuplicates(subset=subset)

         return context.from_existing(data=df)  # type: ignore
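A compact sketch of the new `subset` behavior and its guard (invented data; the check mirrors the one added above, the rest is plain `dropDuplicates`):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Ada", "Lovelace", 1), ("Ada", "Lovelace", 2), ("Alan", "Turing", 3)],
    "first_name string, last_name string, visit int",
)

subset = ["first_name", "last_name"]      # hypothetical subset, as in the YAML example
missing = set(subset) - set(df.columns)   # same guard the action adds
if missing:
    raise ValueError(f"The following subset columns are not part of the dataframe: {missing}")

df.dropDuplicates(subset=subset).show()   # keeps one row per (first_name, last_name)
```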
cloe_nessy/pipeline/actions/transform_filter.py

@@ -8,12 +8,12 @@ class TransformFilterAction(PipelineAction):
     """Filters the DataFrame in the given context based on a specified condition.

     Example:
-        ... (6 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Filter Columns:
+            action: TRANSFORM_FILTER
+            options:
+                condition: city="Hamburg"
+        ```
     """

     name: str = "TRANSFORM_FILTER"
cloe_nessy/pipeline/actions/transform_generic_sql.py

@@ -13,12 +13,18 @@ class TransformSqlAction(PipelineAction):
     statement is executed on that view. The resulting DataFrame is returned.

     Example:
-        ... (6 removed docstring lines not rendered in this diff view)
+        ```yaml
+        SQL Transform:
+            action: TRANSFORM_SQL
+            options:
+                sql_statement: select city, revenue, firm from {DATA_FRAME} where product="Databricks"
+        ```
+        !!! note
+            The SQL statement should reference the DataFrame as "{DATA_FRAME}".
+            This nessy specific placeholder will be replaced with your input
+            DataFrame from the context. If your pipeline is defined as an
+            f-string, you can escape the curly braces by doubling them, e.g.,
+            "{{DATA_FRAME}}".
     """

     name: str = "TRANSFORM_SQL"
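The surrounding docstring says the input DataFrame is registered as a temporary view before the statement runs; the placeholder mechanics then look roughly like this (view name and data are made up, and this is only a sketch of the idea, not the action's code):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Hamburg", 100, "ACME", "Databricks")],
    "city string, revenue int, firm string, product string",
)

sql_statement = 'select city, revenue, firm from {DATA_FRAME} where product="Databricks"'

# Register the input DataFrame as a temp view and point {DATA_FRAME} at it.
df.createOrReplaceTempView("input_df")
result = spark.sql(sql_statement.format(DATA_FRAME="input_df"))
result.show()
```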
cloe_nessy/pipeline/actions/transform_group_aggregate.py

@@ -13,33 +13,27 @@ class TransformGroupAggregate(PipelineAction):
     to other columns. The aggregation functions can be specified as a dictionary where keys are column names
     and values are either a single aggregation function or a list of functions.

+    The output DataFrame will contain the grouped columns and the aggregated columns with the aggregation
+    function as a prefix to the column name.
+
     Example:
-        ... (17 removed docstring lines not rendered in this diff view)
-    Methods:
-        run(context, grouping_columns=None, aggregations=None, **_):
-            Executes the aggregation on the grouped data.
-
-    Raises:
-        ValueError: If the context data is None.
-        ValueError: If no aggregations are provided.
-        ValueError: If invalid aggregation operations are provided.
-        ValueError: If columns with unsupported data types are included in the aggregations.
+        ```yaml
+        Transform Group Aggregate:
+            action: TRANSFORM_GROUP_AGGREGATE
+            options:
+                grouping_columns:
+                    - column1
+                    - column2
+                aggregations:
+                    column3:
+                        - sum
+                        - avg
+                    column4: max
+        ```
+
+        This example groups the DataFrame by `column1` and `column2` and aggregates `column3` by sum and average
+        and `column4` by max. The resulting DataFrame will contain the grouped columns `column1` and `column2`
+        and the aggregated columns `sum_column3`, `avg_column3`, and `max_column4`.
     """

     name: str = "TRANSFORM_GROUP_AGGREGATE"
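The `<function>_<column>` naming convention described above corresponds to aliasing each aggregate expression. A sketch of the equivalent PySpark (illustrative, not the action's internals; column names follow the example):

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", "x", 1, 10), ("a", "x", 3, 20)],
    "column1 string, column2 string, column3 int, column4 int",
)

# Equivalent of the YAML above, with the "<function>_<column>" output naming.
aggregations = {"column3": ["sum", "avg"], "column4": ["max"]}
exprs = [
    getattr(F, func)(col).alias(f"{func}_{col}")
    for col, funcs in aggregations.items()
    for func in funcs
]
df.groupBy("column1", "column2").agg(*exprs).show()
# -> column1, column2, sum_column3, avg_column3, max_column4
```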
cloe_nessy/pipeline/actions/transform_join.py

@@ -8,18 +8,25 @@ from ..pipeline_step import PipelineStep
 class TransformJoinAction(PipelineAction):
     """Joins the current DataFrame with another DataFrame defined in joined_data.

-    The join operation is performed based on specified columns and the type of
-    indicated by the `how` parameter.
+    The join operation is performed based on specified columns and the type of
+    join indicated by the `how` parameter. Supported join types can be taken
+    from [PySpark
+    documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html)

     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Join Tables:
+            action: TRANSFORM_JOIN
+            options:
+                joined_data: ((step:Transform First Table))
+                join_on: id
+                how: anti
+        ```
+
+        !!! note "Referencing a DataFrame from another step"
+            The `joined_data` parameter is a reference to the DataFrame from another step.
+            The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
+            for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
     """

     name: str = "TRANSFORM_JOIN"
cloe_nessy/pipeline/actions/transform_json_normalize.py

@@ -14,12 +14,25 @@ class TransformJsonNormalize(PipelineAction):
     structs are appended after existing columns.

     Example:
-        ... (6 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Normalize Tables:
+            action: TRANSFORM_JSON_NORMALIZE
+            options:
+                exclude_columns: coordinates
+        ```
+        Example Input Data:
+
+        | id | name  | coordinates  | attributes                |
+        |----|-------|--------------|---------------------------|
+        | 1  | Alice | [10.0, 20.0] | {"age": 30, "city": "NY"} |
+        | 2  | Bob   | [30.0, 40.0] | {"age": 25, "city": "LA"} |
+
+        Example Output Data:
+
+        | id | name  | coordinates  | attributes_age | attributes_city |
+        |----|-------|--------------|----------------|-----------------|
+        | 1  | Alice | [10.0, 20.0] | 30             | NY              |
+        | 2  | Bob   | [30.0, 40.0] | 25             | LA              |
     """

     name: str = "TRANSFORM_JSON_NORMALIZE"
cloe_nessy/pipeline/actions/transform_rename_columns.py

@@ -12,13 +12,13 @@ class TransformRenameColumnsAction(PipelineAction):
     name and its corresponding value represents the new column name.

     Example:
-        ... (7 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Rename Column:
+            action: TRANSFORM_RENAME_COLUMNS
+            options:
+                columns:
+                    a_very_long_column_name: shortname
+        ```
     """

     name: str = "TRANSFORM_RENAME_COLUMNS"
cloe_nessy/pipeline/actions/transform_replace_values.py

@@ -13,14 +13,14 @@ class TransformReplaceValuesAction(PipelineAction):
     in the specified columns.

     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Replace Values:
+            action: TRANSFORM_REPLACE_VALUES
+            options:
+                replace:
+                    empl_function:
+                        sales_employee: seller
+        ```
     """

     name: str = "TRANSFORM_REPLACE_VALUES"
cloe_nessy/pipeline/actions/transform_select_columns.py

@@ -14,15 +14,44 @@ class TransformSelectColumnsAction(PipelineAction):
     DataFrame before performing the selection.

     Example:
-        ... (9 removed docstring lines not rendered in this diff view)
+        Example Input Data:
+
+        | id | name  | coordinates  | attributes                |
+        |----|-------|--------------|---------------------------|
+        | 1  | Alice | [10.0, 20.0] | {"age": 30, "city": "NY"} |
+        | 2  | Bob   | [30.0, 40.0] | {"age": 25, "city": "LA"} |
+        === "Include Columns"
+            ```yaml
+            Select Columns:
+                action: TRANSFORM_SELECT_COLUMNS
+                options:
+                    include_columns:
+                        - id
+                        - name
+                        - coordinates
+            ```
+            Example Output Data:
+
+            | id | name  | coordinates  |
+            |----|-------|--------------|
+            | 1  | Alice | [10.0, 20.0] |
+            | 2  | Bob   | [30.0, 40.0] |
+
+        === "Exclude Columns"
+            ```yaml
+            Select Columns:
+                action: TRANSFORM_SELECT_COLUMNS
+                options:
+                    exclude_columns:
+                        - coordinates
+            ```
+            Example Output Data:
+
+            | id | name  | attributes                |
+            |----|-------|---------------------------|
+            | 1  | Alice | {"age": 30, "city": "NY"} |
+            | 2  | Bob   | {"age": 25, "city": "LA"} |
+
     """

     name: str = "TRANSFORM_SELECT_COLUMNS"
cloe_nessy/pipeline/actions/transform_union.py

@@ -17,14 +17,18 @@ class TransformUnionAction(PipelineAction):
     empty, a ValueError will be raised.

     Example:
-        ... (8 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Union Tables:
+            action: TRANSFORM_UNION
+            options:
+                union_data:
+                    - ((step: Filter First Table))
+                    - ((step: SQL Transform Second Table))
+        ```
+        !!! note "Referencing a DataFrame from another step"
+            The `union_data` parameter is a reference to the DataFrame from another step.
+            The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
+            for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
     """

     name: str = "TRANSFORM_UNION"
cloe_nessy/pipeline/actions/write_catalog_table.py

@@ -9,15 +9,16 @@ class WriteCatalogTableAction(PipelineAction):
     """Writes a DataFrame to a specified catalog table using [CatalogWriter][cloe_nessy.integration.writer.CatalogWriter].

     Example:
-        ... (9 removed docstring lines not rendered in this diff view)
+        ```yaml
+        Write Table to Catalog:
+            action: WRITE_CATALOG_TABLE
+            options:
+                table_identifier: my_catalog.business_schema.sales_table
+                mode: append
+                partition_by: day
+                options:
+                    mergeSchema: true
+        ```
     """

     name: str = "WRITE_CATALOG_TABLE"
@@ -42,7 +43,7 @@ class WriteCatalogTableAction(PipelineAction):
             mode: The write mode. One of 'append', 'overwrite', 'error',
                 'errorifexists', or 'ignore'.
             partition_by: Names of the partitioning columns.
-            options:
+            options: PySpark options for the DataFrame.saveAsTable operation (e.g. mergeSchema:true).

         Raises:
             ValueError: If the table name is not specified or cannot be inferred from
cloe_nessy/session/session_manager.py

@@ -63,13 +63,13 @@ class SessionManager:
     @classmethod
     def get_utils(
         cls,
-    ) -> Any:  # return type should be Union[DBUtils, MsSparkUtils
-        """Get or create a DBUtils or MsSparkUtils instance, depending on the context.
+    ) -> Any:  # return type should be Union[DBUtils, MsSparkUtils, RemoteDbUtils].
+        """Get or create a DBUtils, RemoteDbUtils or MsSparkUtils instance, depending on the context.

-        In Databricks this will return DBUtils,
+        In Databricks this will return DBUtils, when using Databricks-Connect it returns RemoteDbUtils, and in Fabric it will return MsSparkUtils.

         Returns:
-            utils: The DBUtils or MsSparkUtils instance.
+            utils: The DBUtils, RemoteDbUtils or MsSparkUtils instance.

         Raises:
             RuntimeError: If the instance cannot be created.
@@ -88,19 +88,25 @@ class SessionManager:
         }

         try:
-            cls._utils = utils_function[cls._env](
+            cls._utils = utils_function[cls._env]()  # type: ignore
         except Exception as e:
             raise RuntimeError(f"Cannot create utils instance. Error: {e}") from e

         return cls._utils

+    @classmethod
     def _get_dbutils(cls):
+        if cls._env == cls.Environment.DATABRICKS_CONNECT:
+            from databricks.sdk import WorkspaceClient
+
+            return WorkspaceClient().dbutils
+
         from pyspark.dbutils import DBUtils

         cls.get_spark_session()
-        ... (1 removed line not rendered in this diff view)
-        return utils
+        return DBUtils(cls._spark)

+    @classmethod
     def _get_mssparkutils(cls):
         from notebookutils import mssparkutils  # type: ignore

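The changes above tighten the environment-based dispatch: the factory is now called with no arguments, and Databricks-Connect gets its own branch that returns `WorkspaceClient().dbutils`. A minimal, self-contained sketch of that dispatch pattern (the enum values and helper bodies are simplified stand-ins, not the SessionManager internals):

```python
from enum import Enum, auto

class Environment(Enum):
    DATABRICKS = auto()
    DATABRICKS_CONNECT = auto()
    FABRIC = auto()

def _get_dbutils():
    # Placeholder for pyspark.dbutils.DBUtils(spark) or WorkspaceClient().dbutils.
    return "DBUtils instance"

def _get_mssparkutils():
    # Placeholder for notebookutils.mssparkutils in Fabric.
    return "mssparkutils instance"

utils_function = {
    Environment.DATABRICKS: _get_dbutils,
    Environment.DATABRICKS_CONNECT: _get_dbutils,  # Databricks-Connect path
    Environment.FABRIC: _get_mssparkutils,
}

env = Environment.DATABRICKS_CONNECT
utils = utils_function[env]()  # called with no arguments, as in the fix above
print(utils)
```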
{cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.5.dist-info}/RECORD

@@ -22,10 +22,10 @@ cloe_nessy/integration/reader/api_reader.py,sha256=j3Z5O1oH-Zc43TyA_aYtnDNYC9xFM
 cloe_nessy/integration/reader/catalog_reader.py,sha256=tGK-Y0jZQGOrF9eZUzSr7ils-L58uex6qH9PZ81ZLy8,1835
 cloe_nessy/integration/reader/excel_reader.py,sha256=4kifpIakHpGmap0-P0SUgjJoQdY-eeiZBIDrQp87wK8,8012
 cloe_nessy/integration/reader/exceptions.py,sha256=_A9jFpe_RIDZCGY76qzjic9bsshxns6yXPSl141dq1c,203
-cloe_nessy/integration/reader/file_reader.py,sha256=
+cloe_nessy/integration/reader/file_reader.py,sha256=1os8pZIXAGTJBZjGREmHOTlZeabbikC7sDv5xn3bIjE,3950
 cloe_nessy/integration/reader/reader.py,sha256=e2KVPePQme8SBQJEbL-3zpGasOgTiEvKFTslow2wGPw,1034
 cloe_nessy/integration/writer/__init__.py,sha256=NIh0t1RYlG3J1Y5_CvnR36N9tISmcElD5Tq06ksmqoA,71
-cloe_nessy/integration/writer/catalog_writer.py,sha256=
+cloe_nessy/integration/writer/catalog_writer.py,sha256=Gb-hMdADgO_uUJ7mZPHBYyNme2qXsdFFnzwo7GcShHM,2192
 cloe_nessy/logging/__init__.py,sha256=ySVCVbdyR3Dno_tl2ZfiER_7EVaDoQMHVkNyfdMZumY,65
 cloe_nessy/logging/logger_mixin.py,sha256=9iy7BF6drYme-f7Rrt_imbVBRgVqQ89xjcP1X5aMtfY,7467
 cloe_nessy/models/__init__.py,sha256=_JPN_R5-QDfjYzvrvZDdeOezl0C-JTG-Rk4S1VE5vJM,242
@@ -47,34 +47,35 @@ cloe_nessy/pipeline/pipeline_config.py,sha256=BN3ZSbr6bC-X9edoh-n5vRfPHFMbgtAU7m
 cloe_nessy/pipeline/pipeline_context.py,sha256=csElDc6BsynDUtRXgQOSCH7ONc_b-ag0YEg0zlQTz58,1874
 cloe_nessy/pipeline/pipeline_parsing_service.py,sha256=c_nAsgw81QYBM9AFiTxGgqRhNXABkDKplbeoCJPtbpE,6434
 cloe_nessy/pipeline/pipeline_step.py,sha256=UlnmpS6gm_dZ7m9dD1mZvye7mvUF_DA7HjOZo0oGYDU,1977
-cloe_nessy/pipeline/actions/__init__.py,sha256=
-cloe_nessy/pipeline/actions/read_api.py,sha256=
-cloe_nessy/pipeline/actions/read_catalog_table.py,sha256
-cloe_nessy/pipeline/actions/read_excel.py,sha256=
-cloe_nessy/pipeline/actions/read_files.py,sha256=
-cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=
-cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=
-cloe_nessy/pipeline/actions/ ... (14 further 0.3.2 action entries truncated in this rendering)
+cloe_nessy/pipeline/actions/__init__.py,sha256=LwKctXy4Jun52BnCVGvWa8nnKVjTSov4GT58j6Zy8zg,2273
+cloe_nessy/pipeline/actions/read_api.py,sha256=RBv5XeHtjTXuCP09Fqo6JNx6iIhQQI-nuAHCuSaGs2s,7778
+cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=-k2wezkv8bE_xwoW7WM1ORhrCXQagKTUuXkhI2ZEROs,2783
+cloe_nessy/pipeline/actions/read_excel.py,sha256=Mhl3r_2Hqk2XN7Fl5WqqAyE4JdnwSiivbhWMglyBtkE,7961
+cloe_nessy/pipeline/actions/read_files.py,sha256=N9bFgtG1tovhp2JayxE5YiN9PiO2lgG2-6h_Y6tD2eU,5220
+cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=3ZDy9qiDYtM1oDQzHPC23hLOvHjhdk5zg1wVHE60m9k,2295
+cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=24Tn6R3TvUkWCh8V6naLdyNbCbqvyPOOoer-hy_Ebq4,2077
+cloe_nessy/pipeline/actions/transform_clean_column_names.py,sha256=-CEdcXb7Fz5DQNitGlJ8EVBE_LzxfsInyCIO-D7b4iY,3042
+cloe_nessy/pipeline/actions/transform_concat_columns.py,sha256=Nk8YbhxDnFZsWzW9Dj5Yl76Uq6VrcMlevQPHGms65L8,3777
+cloe_nessy/pipeline/actions/transform_decode.py,sha256=JajMwHREtxa8u_1Q3RZDBVMjncoSel-WzQFVTO0MREg,4455
+cloe_nessy/pipeline/actions/transform_deduplication.py,sha256=E0ypz9qkHMSatNfnHekP-E6svQVL149M4PV02M03drg,5099
+cloe_nessy/pipeline/actions/transform_distinct.py,sha256=c7aBxANyqT4aKhm0cSELDtD-bP0Se9vxlBF0K4AgQWs,1976
+cloe_nessy/pipeline/actions/transform_filter.py,sha256=Nz_ggRfKIcNzYFfFOsgq1QeatjdEis0up4I7cOWBdyo,1446
+cloe_nessy/pipeline/actions/transform_generic_sql.py,sha256=_naWfmPdYAUKjPNeHu5qJAohOL7DHCSYz_kwoeRv3OI,2741
+cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=KUHeeP-RIDi34dpbsPEJkzea5zFJA6MuyjNpOsFud9o,4045
+cloe_nessy/pipeline/actions/transform_join.py,sha256=e_tvMk8YJTAWcUK_EmOgNt0s31ICZoMX_MKOTWx4lBY,3645
+cloe_nessy/pipeline/actions/transform_json_normalize.py,sha256=petF7pnNq1EKc8MqVdG0weFALAHNILSe_eAu4Z5XxIo,4833
+cloe_nessy/pipeline/actions/transform_rename_columns.py,sha256=4zJcPCONMU4C67qeuzsrX3AORRRHoq_selUI7FJyeg0,1952
+cloe_nessy/pipeline/actions/transform_replace_values.py,sha256=1OPHTrjcphfyGepcO7ozYfeqfwA18pjlyHpVKUS_AAU,2049
+cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=-GhSEsb7iNnZIsYRm3BG9BX4_qUDJMbpj1DsKPY046w,4574
+cloe_nessy/pipeline/actions/transform_union.py,sha256=s81Vge0AbYPc7VkskCYfOQ_LEjqcmfNFyDkytfjcZyo,2720
+cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=j7gRuG3Fedh8JgevIFBbHKock3laJVq4l6Mx3CGU5eo,2676
 cloe_nessy/session/__init__.py,sha256=t7_YjUhJYW3km_FrucaUdbIl1boQtwkyhw_8yE10qzc,74
-cloe_nessy/session/session_manager.py,sha256=
+cloe_nessy/session/session_manager.py,sha256=PK7awMc6fmot7f9FMmvIUbIzKFgjcy2o2bZS9kjVs10,6733
 cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEvzM,101
 cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
 cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
-cloe_nessy-0.3.
-cloe_nessy-0.3.
-cloe_nessy-0.3.
-cloe_nessy-0.3.
+cloe_nessy-0.3.5.dist-info/METADATA,sha256=UUx3aIUgvCLn7j3H4DbCL1k9-47HPKaANiMQsUj66wo,1837
+cloe_nessy-0.3.5.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+cloe_nessy-0.3.5.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
+cloe_nessy-0.3.5.dist-info/RECORD,,
File without changes
|