cloe-nessy 0.3.17.0__py3-none-any.whl → 0.3.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. cloe_nessy/clients/api_client/__init__.py +10 -1
  2. cloe_nessy/clients/api_client/api_client.py +19 -8
  3. cloe_nessy/clients/api_client/api_response.py +7 -4
  4. cloe_nessy/clients/api_client/pagination_config.py +84 -0
  5. cloe_nessy/clients/api_client/pagination_strategy.py +500 -0
  6. cloe_nessy/integration/delta_loader/delta_loader.py +1 -1
  7. cloe_nessy/integration/reader/__init__.py +2 -2
  8. cloe_nessy/integration/reader/api_reader.py +463 -72
  9. cloe_nessy/integration/reader/catalog_reader.py +49 -10
  10. cloe_nessy/integration/reader/excel_reader.py +3 -3
  11. cloe_nessy/integration/reader/file_reader.py +3 -1
  12. cloe_nessy/integration/reader/reader.py +1 -1
  13. cloe_nessy/integration/writer/catalog_writer.py +64 -2
  14. cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +5 -1
  15. cloe_nessy/models/column.py +3 -2
  16. cloe_nessy/models/schema.py +1 -0
  17. cloe_nessy/models/templates/create_table.sql.j2 +22 -0
  18. cloe_nessy/object_manager/table_manager.py +29 -7
  19. cloe_nessy/pipeline/actions/__init__.py +1 -1
  20. cloe_nessy/pipeline/actions/read_api.py +272 -75
  21. cloe_nessy/pipeline/actions/read_catalog_table.py +73 -10
  22. cloe_nessy/pipeline/actions/read_excel.py +1 -1
  23. cloe_nessy/pipeline/actions/read_metadata_yaml.py +61 -33
  24. cloe_nessy/pipeline/actions/transform_decode.py +2 -1
  25. cloe_nessy/pipeline/actions/transform_join.py +98 -24
  26. cloe_nessy/pipeline/actions/transform_union.py +2 -2
  27. cloe_nessy/pipeline/actions/write_catalog_table.py +66 -21
  28. cloe_nessy/pipeline/actions/write_delta_merge.py +1 -0
  29. cloe_nessy/pipeline/pipeline_config.py +2 -0
  30. cloe_nessy/pipeline/pipeline_context.py +1 -1
  31. cloe_nessy/pipeline/pipeline_parsing_service.py +104 -39
  32. cloe_nessy/pipeline/pipeline_step.py +2 -0
  33. cloe_nessy/session/__init__.py +2 -1
  34. cloe_nessy/session/pyspark_compat.py +15 -0
  35. cloe_nessy/session/session_manager.py +1 -1
  36. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/METADATA +19 -19
  37. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/RECORD +38 -36
  38. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/WHEEL +1 -2
  39. cloe_nessy-0.3.17.0.dist-info/top_level.txt +0 -1
cloe_nessy/pipeline/actions/read_metadata_yaml.py
@@ -1,66 +1,94 @@
-import pathlib
+from pathlib import Path
 from typing import Any
 
-from ...models import Schema
+from ...models import Table
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
 
 class ReadMetadataYAMLAction(PipelineAction):
-    """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.
+    """Reads table metadata from a yaml file using the [`Table`][cloe_nessy.models.table] model.
 
     Example:
-        ```yaml
-        Read Schema Metadata:
-            action: READ_METADATA_YAML_ACTION
-            options:
-                path: excel_file_folder/excel_files_june/
-                file_name: sales_schema.yml
-                table_name: sales
-        ```
+        === "Managed Table"
+            ```yaml
+            Read Table Metadata:
+                action: READ_METADATA_YAML_ACTION
+                options:
+                    file_path: metadata/schemas/bronze/sales_table.yml
+                    catalog_name: production
+                    schema_name: sales_data
+            ```
+        === "External Table"
+            ```yaml
+            Read Table Metadata:
+                action: READ_METADATA_YAML_ACTION
+                options:
+                    file_path: metadata/schemas/bronze/sales_table.yml
+                    catalog_name: production
+                    schema_name: sales_data
+                    storage_path: abfs://external_storage/sales_data/sales_table
+            ```
     """
 
    name: str = "READ_METADATA_YAML_ACTION"
 
-    @staticmethod
     def run(
+        self,
         context: PipelineContext,
         *,
-        path: str | None = None,
-        file_name: str | None = None,
-        table_name: str | None = None,
+        file_path: str | None = None,
+        catalog_name: str | None = None,
+        schema_name: str | None = None,
+        storage_path: str | None = None,
         **_: Any,
     ) -> PipelineContext:
-        """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.
+        """Reads table metadata from a yaml file using the [`Table`][cloe_nessy.models.table] model.
 
         Args:
             context: The context in which this Action is executed.
-            path: The path to the data contract directory.
-            file_name: The name of the file that defines the schema.
-            table_name: The name of the table for which to retrieve metadata.
+            file_path: The path to the file that defines the table.
+            catalog_name: The name of the catalog for the table.
+            schema_name: The name of the schema for the table.
+            storage_path: The storage path for the table, if applicable. If not
+                provided, the table will be considered a managed table.
 
         Raises:
-            ValueError: If any issues occur while reading the schema, such as an invalid schema,
-                missing file, or missing path.
+            ValueError: If any issues occur while reading the table metadata, such as an invalid table,
+                missing file, missing path, or missing catalog/schema names.
 
         Returns:
            The context after the execution of this Action, containing the table metadata.
         """
-        if not path:
-            raise ValueError("No path provided. Please specify path to schema metadata.")
-        if not file_name:
-            raise ValueError("No file_name provided. Please specify file name.")
-        if not table_name:
-            raise ValueError("No table_name provided. Please specify table name.")
+        missing_params = []
+        if not file_path:
+            missing_params.append("file_path")
+        if not catalog_name:
+            missing_params.append("catalog_name")
+        if not schema_name:
+            missing_params.append("schema_name")
 
-        path_obj = pathlib.Path(path)
+        if missing_params:
+            raise ValueError(
+                f"Missing required parameters: {', '.join(missing_params)}. Please specify all required parameters."
+            )
 
-        schema, errors = Schema.read_instance_from_file(path_obj / file_name)
+        final_file_path = Path(file_path) if file_path else Path()
+
+        table, errors = Table.read_instance_from_file(
+            final_file_path,
+            catalog_name=catalog_name,
+            schema_name=schema_name,
+        )
         if errors:
-            raise ValueError(f"Errors while reading schema metadata: {errors}")
-        if not schema:
-            raise ValueError("No schema found in metadata.")
+            raise ValueError(f"Errors while reading table metadata: {errors}")
+        if not table:
+            raise ValueError("No table found in metadata.")
 
-        table = schema.get_table_by_name(table_name=table_name)
+        if not table.storage_path and storage_path:
+            self._console_logger.info(f"Setting storage path for table [ '{table.name}' ] to [ '{storage_path}' ]")
+            table.storage_path = storage_path
+            table.is_external = True
 
+        self._console_logger.info(f"Table [ '{table.name}' ] metadata read successfully from [ '{file_path}' ]")
         return context.from_existing(table_metadata=table)
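
The new `run` signature replaces the old `path`/`file_name`/`table_name` triple with `file_path`, `catalog_name`, and `schema_name`, and marks the table as external only when a `storage_path` is supplied. A minimal usage sketch based on the signature above (the `pipeline_context` object and default construction of the action are assumptions; the pipeline machinery normally wires these up):

```python
# Sketch only: illustrates the new keyword-only parameters of ReadMetadataYAMLAction.run().
# `pipeline_context` stands in for a PipelineContext created by the surrounding pipeline.
from cloe_nessy.pipeline.actions.read_metadata_yaml import ReadMetadataYAMLAction

action = ReadMetadataYAMLAction()  # assumes the action can be constructed with defaults
result_context = action.run(
    pipeline_context,
    file_path="metadata/schemas/bronze/sales_table.yml",
    catalog_name="production",
    schema_name="sales_data",
    storage_path=None,  # None -> managed table; a path -> table is marked external
)
table = result_context.table_metadata  # the populated Table model
```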
cloe_nessy/pipeline/actions/transform_decode.py
@@ -1,8 +1,9 @@
 from typing import Any
 
-from pyspark.sql import DataFrame
 from pyspark.sql.functions import col, from_json, schema_of_json, unbase64
 
+from cloe_nessy.session import DataFrame
+
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
cloe_nessy/pipeline/actions/transform_join.py
@@ -1,5 +1,7 @@
 from typing import Any
 
+from pyspark.sql import functions as F
+
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 from ..pipeline_step import PipelineStep
@@ -13,20 +15,74 @@ class TransformJoinAction(PipelineAction):
     from [PySpark
     documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html)
 
-    Example:
-        ```yaml
-        Join Tables:
-            action: TRANSFORM_JOIN
-            options:
-                joined_data: ((step:Transform First Table))
-                join_on: id
-                how: anti
-        ```
+    Examples:
+        === "Simple Column Join"
+            ```yaml
+            Join Tables:
+                action: TRANSFORM_JOIN
+                options:
+                    joined_data: ((step:Transform First Table))
+                    join_on: id
+                    how: inner
+            ```
+
+        === "Multiple Columns Join"
+            ```yaml
+            Join Tables:
+                action: TRANSFORM_JOIN
+                options:
+                    joined_data: ((step:Transform First Table))
+                    join_on: [customer_id, order_date]
+                    how: left
+            ```
+
+        === "Dictionary Join (Different Column Names)"
+            ```yaml
+            Join Tables:
+                action: TRANSFORM_JOIN
+                options:
+                    joined_data: ((step:Transform First Table))
+                    join_on:
+                        customer_id: cust_id
+                        order_date: date
+                    how: inner
+            ```
+
+        === "Complex Join with Literals and Expressions"
+            ```yaml
+            Join Tables:
+                action: TRANSFORM_JOIN
+                options:
+                    joined_data: ((step:Load Conditions Table))
+                    join_condition: |
+                        left.material = right.material
+                        AND right.sales_org = '10'
+                        AND right.distr_chan = '10'
+                        AND right.knart = 'ZUVP'
+                        AND right.lovmkond <> 'X'
+                        AND right.sales_unit = 'ST'
+                        AND left.calday BETWEEN
+                            to_date(right.date_from, 'yyyyMMdd') AND
+                            to_date(right.date_to, 'yyyyMMdd')
+                    how: left
+            ```
 
     !!! note "Referencing a DataFrame from another step"
         The `joined_data` parameter is a reference to the DataFrame from another step.
         The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
         for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
+
+    !!! tip "Dictionary Join Syntax"
+        When using a dictionary for `join_on`, the keys represent columns
+        from the DataFrame in context and the values represent columns from
+        the DataFrame in `joined_data`. This is useful when joining tables
+        with different column names for the same logical entity.
+
+    !!! tip "Complex Join Conditions"
+        Use `join_condition` instead of `join_on` for complex joins with literals,
+        expressions, and multiple conditions. Reference columns using `left.column_name`
+        for the main DataFrame and `right.column_name` for the joined DataFrame.
+        Supports all PySpark functions and operators.
     """
 
     name: str = "TRANSFORM_JOIN"
@@ -37,6 +93,7 @@ class TransformJoinAction(PipelineAction):
         *,
         joined_data: PipelineStep | None = None,
         join_on: list[str] | str | dict[str, str] | None = None,
+        join_condition: str | None = None,
         how: str = "inner",
         **_: Any,
     ) -> PipelineContext:
@@ -49,13 +106,17 @@ class TransformJoinAction(PipelineAction):
             join_on: A string for the join column
                 name, a list of column names, or a dictionary mapping columns from the
                 left DataFrame to the right DataFrame. This defines the condition for the
-                join operation.
+                join operation. Mutually exclusive with join_condition.
+            join_condition: A string containing a complex join expression with literals,
+                functions, and multiple conditions. Use 'left.' and 'right.' prefixes
+                to reference columns from respective DataFrames. Mutually exclusive with join_on.
             how: The type of join to perform. Must be one of: inner, cross, outer,
                 full, fullouter, left, leftouter, right, rightouter, semi, anti, etc.
 
         Raises:
             ValueError: If no joined_data is provided.
-            ValueError: If no join_on is provided.
+            ValueError: If neither join_on nor join_condition is provided.
+            ValueError: If both join_on and join_condition are provided.
             ValueError: If the data from context is None.
             ValueError: If the data from the joined_data is None.
 
@@ -64,8 +125,12 @@ class TransformJoinAction(PipelineAction):
         """
         if joined_data is None or joined_data.result is None or joined_data.result.data is None:
             raise ValueError("No joined_data provided.")
-        if not join_on:
-            raise ValueError("No join_on provided.")
+
+        if not join_on and not join_condition:
+            raise ValueError("Either join_on or join_condition must be provided.")
+
+        if join_on and join_condition:
+            raise ValueError("Cannot specify both join_on and join_condition. Use one or the other.")
 
         if context.data is None:
             raise ValueError("Data from the context is required for the operation.")
@@ -73,16 +138,25 @@ class TransformJoinAction(PipelineAction):
         df_right = joined_data.result.data.alias("right")  # type: ignore
         df_left = context.data.alias("left")  # type: ignore
 
-        if isinstance(join_on, str):
-            join_condition = [join_on]
-        elif isinstance(join_on, list):
-            join_condition = join_on
-        else:
-            join_condition = [
-                df_left[left_column] == df_right[right_column]  # type: ignore
-                for left_column, right_column in join_on.items()
-            ]
-
-        df = df_left.join(df_right, on=join_condition, how=how)  # type: ignore
+        if join_condition:
+            try:
+                condition = F.expr(join_condition)
+            except Exception as e:
+                # this will not raise an error in most cases, because the evaluation of the expression is lazy
+                raise ValueError(f"Failed to parse join condition '{join_condition}': {str(e)}") from e
+            df = df_left.join(df_right, on=condition, how=how)  # type: ignore
+
+        if join_on:
+            if isinstance(join_on, str):
+                join_condition_list = [join_on]
+            elif isinstance(join_on, list):
+                join_condition_list = join_on
+            else:
+                join_condition_list = [
+                    df_left[left_column] == df_right[right_column]  # type: ignore
+                    for left_column, right_column in join_on.items()
+                ]
+
+            df = df_left.join(df_right, on=join_condition_list, how=how)  # type: ignore
 
         return context.from_existing(data=df)  # type: ignore
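
Outside the action, the two join paths reduce to ordinary PySpark calls. A small standalone sketch for orientation (the DataFrames and column names here are invented for the example and are not part of the package):

```python
# Standalone illustration of the two join styles the action now supports.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
left_df = spark.createDataFrame([(1, "2024-01-01")], ["customer_id", "order_date"]).alias("left")
right_df = spark.createDataFrame([(1, "2024-01-01", 9.99)], ["cust_id", "date", "price"]).alias("right")

# join_on as a dict: keys are columns of the left (context) DataFrame,
# values are columns of the right (joined_data) DataFrame.
equi_join = left_df.join(
    right_df,
    on=[left_df["customer_id"] == right_df["cust_id"], left_df["order_date"] == right_df["date"]],
    how="inner",
)

# join_condition: a single SQL expression parsed with F.expr, referencing the
# "left" and "right" aliases that the action assigns to the two DataFrames.
condition = F.expr("left.customer_id = right.cust_id AND left.order_date >= right.date")
expr_join = left_df.join(right_df, on=condition, how="left")
```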
cloe_nessy/pipeline/actions/transform_union.py
@@ -22,8 +22,8 @@ class TransformUnionAction(PipelineAction):
             action: TRANSFORM_UNION
             options:
                 union_data:
-                    - ((step: Filter First Table))
-                    - ((step: SQL Transform Second Table))
+                    - ((step:Filter First Table))
+                    - ((step:SQL Transform Second Table))
         ```
     !!! note "Referencing a DataFrame from another step"
         The `union_data` parameter is a reference to the DataFrame from another step.
cloe_nessy/pipeline/actions/write_catalog_table.py
@@ -2,6 +2,7 @@ from typing import Any
 
 from ...integration.delta_loader import consume_delta_load
 from ...integration.writer import CatalogWriter
+from ...object_manager import TableManager
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
@@ -9,17 +10,31 @@ from ..pipeline_context import PipelineContext
 class WriteCatalogTableAction(PipelineAction):
     """Writes a DataFrame to a specified catalog table using [CatalogWriter][cloe_nessy.integration.writer.CatalogWriter].
 
-    Example:
-        ```yaml
-        Write Table to Catalog:
-            action: WRITE_CATALOG_TABLE
-            options:
-                table_identifier: my_catalog.business_schema.sales_table
-                mode: append
-                partition_by: day
+    Examples:
+        === "Batch Write"
+            ```yaml
+            Write Table to Catalog:
+                action: WRITE_CATALOG_TABLE
                 options:
-                    mergeSchema: true
-        ```
+                    table_identifier: my_catalog.business_schema.sales_table
+                    mode: append
+                    partition_by: day
+                    options:
+                        mergeSchema: true
+            ```
+        === "Streaming Write"
+            ```yaml
+            Write Table to Catalog Stream:
+                action: WRITE_CATALOG_TABLE
+                options:
+                    table_identifier: my_catalog.business_schema.sales_table
+                    mode: append
+                    checkpoint_location: /path/to/checkpoint
+                    trigger_dict:
+                        processingTime: 10 seconds
+                    options:
+                        mergeSchema: true
+            ```
     """
 
     name: str = "WRITE_CATALOG_TABLE"
@@ -32,6 +47,9 @@ class WriteCatalogTableAction(PipelineAction):
         mode: str = "append",
         partition_by: str | list[str] | None = None,
         options: dict[str, str] | None = None,
+        checkpoint_location: str | None = None,
+        trigger_dict: dict | None = None,
+        await_termination: bool = False,
         **_: Any,
     ) -> PipelineContext:
         """Writes a DataFrame to a specified catalog table.
@@ -44,7 +62,11 @@ class WriteCatalogTableAction(PipelineAction):
             mode: The write mode. One of 'append', 'overwrite', 'error',
                 'errorifexists', or 'ignore'.
             partition_by: Names of the partitioning columns.
-            options: PySpark options for the DataFrame.saveAsTable operation (e.g. mergeSchema:true).
+            checkpoint_location: Location for checkpointing.
+            trigger_dict: A dictionary specifying the trigger configuration for the streaming query.
+            await_termination: If True, the function will wait for the streaming
+                query to finish before returning.
+            options: Additional options for the DataFrame write operation.
 
         Raises:
             ValueError: If the table name is not specified or cannot be inferred from
@@ -55,25 +77,48 @@ class WriteCatalogTableAction(PipelineAction):
         """
         if not options:
             options = dict()
-        if partition_by is None:
-            if hasattr(context.table_metadata, "partition_by"):
-                partition_by = context.table_metadata.partition_by  # type: ignore
+        streaming = context.runtime_info and context.runtime_info.get("streaming")
+        if streaming and not checkpoint_location:
+            raise ValueError("Checkpoint location must be specified for streaming writes.")
+        if (
+            partition_by is None
+            and context.table_metadata is not None
+            and hasattr(context.table_metadata, "partition_by")
+            and not context.table_metadata.liquid_clustering
+        ):
+            partition_by = context.table_metadata.partition_by  # type: ignore
 
         if (table_metadata := context.table_metadata) and table_identifier is None:
             table_identifier = table_metadata.identifier
         if table_identifier is None:
             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
 
+        if table_metadata:
+            manager = TableManager()
+            manager.create_table(table=table_metadata, ignore_if_exists=True, replace=False)
+
         runtime_info = getattr(context, "runtime_info", None)
         if runtime_info and runtime_info.get("is_delta_load"):
             consume_delta_load(runtime_info)
 
         writer = CatalogWriter()
-        writer.write_table(
-            df=context.data,  # type: ignore
-            table_identifier=table_identifier,
-            mode=mode,
-            partition_by=partition_by,
-            options=options,
-        )
+
+        if streaming:
+            writer.write_stream(
+                df=context.data,  # type: ignore
+                table_identifier=table_identifier,
+                checkpoint_location=checkpoint_location,
+                trigger_dict=trigger_dict,
+                options=options,
+                mode=mode,
+                await_termination=await_termination,
+            )
+        else:
+            writer.write(
+                df=context.data,  # type: ignore
+                table_identifier=table_identifier,
+                mode=mode,
+                partition_by=partition_by,
+                options=options,
+            )
         return context.from_existing()
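
For reference, the streaming branch corresponds to a plain Structured Streaming write. A rough bare-PySpark equivalent is sketched below; the source table and checkpoint path are placeholders taken from the docstring example, and the action itself delegates this work to `CatalogWriter.write_stream` rather than calling PySpark directly:

```python
# Rough plain-PySpark equivalent of the new streaming branch (placeholders only).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
stream_df = spark.readStream.table("my_catalog.bronze.sales_events")  # placeholder source

query = (
    stream_df.writeStream.format("delta")
    .outputMode("append")                                 # mode: append
    .option("checkpointLocation", "/path/to/checkpoint")  # checkpoint_location
    .option("mergeSchema", "true")                        # options: mergeSchema
    .trigger(processingTime="10 seconds")                 # trigger_dict
    .toTable("my_catalog.business_schema.sales_table")    # table_identifier
)
query.awaitTermination()  # only when await_termination=True
```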
cloe_nessy/pipeline/actions/write_delta_merge.py
@@ -117,6 +117,7 @@ class WriteDeltaMergeAction(PipelineAction):
 
         delta_merge_writer.write(
             table_identifier=context.table_metadata.identifier,
+            table=context.table_metadata,
             storage_path=str(context.table_metadata.storage_path),
             data_frame=context.data,
             key_columns=key_columns,
cloe_nessy/pipeline/pipeline_config.py
@@ -83,6 +83,7 @@ class PipelineStepConfig(PipelineConfigBaseModel):
     context: str | None = None
     table_metadata: str | None = None
     options: dict = Field(default_factory=dict)
+    env: dict = Field(default_factory=dict)
 
 
 class PipelineConfig(PipelineConfigBaseModel):
@@ -90,3 +91,4 @@ class PipelineConfig(PipelineConfigBaseModel):
 
     name: str
     steps: OrderedDict[str, PipelineStepConfig]
+    env: dict[str, str] = Field(default_factory=dict)
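
Taken together, the two `env` additions give both the pipeline and each step a free-form environment mapping. A condensed sketch of the resulting models, assuming `PipelineConfigBaseModel` is a pydantic `BaseModel` and showing only the fields visible in this diff:

```python
# Condensed sketch of the extended config models; PipelineConfigBaseModel is
# assumed to be a pydantic BaseModel, and fields not shown in the diff are omitted.
from collections import OrderedDict
from pydantic import BaseModel, Field


class PipelineStepConfig(BaseModel):
    context: str | None = None
    table_metadata: str | None = None
    options: dict = Field(default_factory=dict)
    env: dict = Field(default_factory=dict)  # new: per-step environment values


class PipelineConfig(BaseModel):
    name: str
    steps: OrderedDict[str, PipelineStepConfig]
    env: dict[str, str] = Field(default_factory=dict)  # new: pipeline-level environment values
```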
cloe_nessy/pipeline/pipeline_context.py
@@ -1,6 +1,6 @@
 from typing import Any
 
-from pyspark.sql import DataFrame
+from cloe_nessy.session import DataFrame
 
 from ..models import Table
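
This hunk, like the one in transform_decode.py, swaps the `DataFrame` import from `pyspark.sql` to `cloe_nessy.session`, backed by the new `cloe_nessy/session/pyspark_compat.py` module listed in the file table but not shown here. A purely hypothetical sketch of what such a 15-line shim might look like (the real module may resolve the type differently):

```python
# Hypothetical sketch of a pyspark_compat-style shim; the actual
# cloe_nessy/session/pyspark_compat.py is not included in this diff.
try:
    # Spark Connect / Databricks Connect ship a separate DataFrame class.
    from pyspark.sql.connect.dataframe import DataFrame  # type: ignore[assignment]
except ImportError:  # plain PySpark without the Connect client
    from pyspark.sql import DataFrame

__all__ = ["DataFrame"]
```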