cloe-nessy 0.3.16.6b0__py3-none-any.whl → 0.3.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/integration/delta_loader/__init__.py +14 -0
- cloe_nessy/integration/delta_loader/delta_load_options.py +37 -0
- cloe_nessy/integration/delta_loader/delta_loader.py +165 -0
- cloe_nessy/integration/delta_loader/delta_loader_factory.py +53 -0
- cloe_nessy/integration/delta_loader/delta_loader_metadata_table.py +68 -0
- cloe_nessy/integration/delta_loader/strategies/__init__.py +9 -0
- cloe_nessy/integration/delta_loader/strategies/delta_cdf_loader.py +361 -0
- cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py +163 -0
- cloe_nessy/integration/reader/catalog_reader.py +33 -6
- cloe_nessy/integration/reader/file_reader.py +23 -0
- cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +1 -1
- cloe_nessy/logging/logger_mixin.py +0 -1
- cloe_nessy/models/column.py +1 -1
- cloe_nessy/models/table.py +4 -3
- cloe_nessy/object_manager/table_manager.py +3 -1
- cloe_nessy/pipeline/actions/__init__.py +4 -0
- cloe_nessy/pipeline/actions/read_catalog_table.py +36 -3
- cloe_nessy/pipeline/actions/read_files.py +45 -3
- cloe_nessy/pipeline/actions/transform_convert_timestamp.py +97 -0
- cloe_nessy/pipeline/actions/transform_deduplication.py +7 -12
- cloe_nessy/pipeline/actions/transform_hash_columns.py +7 -7
- cloe_nessy/pipeline/actions/write_catalog_table.py +5 -0
- cloe_nessy/pipeline/actions/write_delta_append.py +15 -0
- cloe_nessy/pipeline/actions/write_delta_merge.py +23 -0
- cloe_nessy/pipeline/actions/write_file.py +6 -1
- cloe_nessy/pipeline/utils/__init__.py +5 -0
- cloe_nessy/pipeline/utils/delta_load_utils.py +36 -0
- cloe_nessy/utils/column_names.py +9 -0
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/METADATA +3 -3
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/RECORD +32 -20
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/WHEEL +0 -0
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/top_level.txt +0 -0
cloe_nessy/models/table.py
CHANGED

@@ -2,7 +2,6 @@ from pathlib import Path
 from typing import Any, Self
 
 import yaml
-import yaml.scanner
 from jinja2 import TemplateNotFound
 from pydantic import (
     Field,
@@ -11,6 +10,8 @@ from pydantic import (
     field_validator,
     model_validator,
 )
+from yaml.parser import ParserError
+from yaml.scanner import ScannerError
 
 from ..logging import LoggerMixin
 from ..utils.file_and_directory_handler import process_path
@@ -225,8 +226,8 @@ class Table(TemplateLoaderMixin, ReadInstancesMixin, LoggerMixin):
                 errors += sub_errors
             except (
                 ValidationError,
-
-
+                ParserError,
+                ScannerError,
             ) as e:
                 instance = None
                 errors.append(e)
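
The change above drops `import yaml.scanner` in favour of importing `ParserError` and `ScannerError` directly; both are the exceptions PyYAML raises for malformed documents. A minimal, hypothetical sketch of the same catch-and-collect idea outside the package (the function name and return shape are illustrative, not cloe-nessy API):

```python
from pathlib import Path

import yaml
from yaml.parser import ParserError
from yaml.scanner import ScannerError


def try_load_yaml(path: Path) -> tuple[dict | None, Exception | None]:
    """Parse a YAML file, returning either the mapping or the parse/scan error."""
    try:
        return yaml.safe_load(path.read_text()), None
    except (ParserError, ScannerError) as error:
        # Malformed YAML: collect the error instead of raising, mirroring errors.append(e) above.
        return None, error
```
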
cloe_nessy/object_manager/table_manager.py
CHANGED

@@ -198,7 +198,9 @@ class TableManager(LoggerMixin):
             ValueError: If neither table nor location is provided, or if both are provided.
         """
         if (table is None and location is None) or (table is not None and location is not None):
-            raise ValueError(
+            raise ValueError(
+                f"Either table or location must be provided, but not both. Table: {table}, location: {location}",
+            )
 
         if table is not None:
             location = str(table.storage_path)
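
The guard above is a plain either-or check on the two optional arguments; the new multi-line `raise` only adds the explanatory message. A standalone sketch of the same rule with hypothetical names:

```python
# Illustrative only: mirrors the either/or validation above, not the package's own signature.
def resolve_location(table=None, location=None) -> str:
    if (table is None and location is None) or (table is not None and location is not None):
        raise ValueError(
            f"Either table or location must be provided, but not both. Table: {table}, location: {location}",
        )
    return str(table.storage_path) if table is not None else location
```
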
cloe_nessy/pipeline/actions/__init__.py
CHANGED

@@ -9,7 +9,9 @@ from .read_metadata_yaml import ReadMetadataYAMLAction
 from .transform_change_datatype import TransformChangeDatatypeAction
 from .transform_clean_column_names import TransformCleanColumnNamesAction
 from .transform_concat_columns import TransformConcatColumnsAction
+from .transform_convert_timestamp import TransformConvertTimestampAction
 from .transform_decode import TransformDecodeAction
+from .transform_deduplication import TransformDeduplication
 from .transform_distinct import TransformDistinctAction
 from .transform_filter import TransformFilterAction
 from .transform_generic_sql import TransformSqlAction
@@ -45,7 +47,9 @@ __all__ = [
     "TransformChangeDatatypeAction",
     "TransformCleanColumnNamesAction",
     "TransformConcatColumnsAction",
+    "TransformConvertTimestampAction",
     "TransformDecodeAction",
+    "TransformDeduplication",
     "TransformDistinctAction",
     "TransformSqlAction",
     "TransformGroupAggregate",

cloe_nessy/pipeline/actions/read_catalog_table.py
CHANGED

@@ -1,8 +1,10 @@
 from typing import Any
 
+from ...integration.delta_loader import DeltaLoadOptions
 from ...integration.reader import CatalogReader
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
+from ..utils import set_delta_load_info
 
 
 class ReadCatalogTableAction(PipelineAction):
@@ -21,6 +23,12 @@ class ReadCatalogTableAction(PipelineAction):
         options:
             table_identifier: my_catalog.business_schema.sales_table
             options: <options for the CatalogReader read method>
+            delta_load_options:
+                strategy: CDF
+                delta_load_identifier: my_delta_load_id
+                strategy_options:
+                    deduplication_columns: ["id"]
+                enable_full_load: true
     ```
     """
 
@@ -32,6 +40,7 @@ class ReadCatalogTableAction(PipelineAction):
         *,
         table_identifier: str | None = None,
         options: dict[str, str] | None = None,
+        delta_load_options: dict[Any, Any] | DeltaLoadOptions | None = None,
         **_: Any,  # define kwargs to match the base class signature
     ) -> PipelineContext:
         """Reads a table from Unity Catalog using a specified table identifier and optional reader configurations.
@@ -45,6 +54,8 @@ class ReadCatalogTableAction(PipelineAction):
             options: A dictionary of options for customizing
                 the [`CatalogReader`][cloe_nessy.integration.reader.catalog_reader]
                 behavior, such as filters or reading modes. Defaults to None.
+            delta_load_options: Options for delta loading, if applicable.
+                Configures the [`DeltaLoader`][cloe_nessy.integration.delta_loader].
 
         Raises:
             ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
@@ -53,13 +64,35 @@
             An updated pipeline context containing the data read from the catalog table as a DataFrame.
         """
         if not options:
-            options =
+            options = {}
+
+        if not delta_load_options:
+            delta_load_options = {}
 
         if (table_metadata := context.table_metadata) and table_identifier is None:
             table_identifier = table_metadata.identifier
         if table_identifier is None:
             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
 
+        if isinstance(delta_load_options, dict):
+            delta_options_dict = delta_load_options
+            if delta_load_options:
+                delta_load_options = DeltaLoadOptions(**delta_load_options)
+            else:
+                delta_load_options = None
+        else:
+            delta_options_dict = delta_load_options.model_dump() if delta_load_options else {}
+
+        runtime_info = set_delta_load_info(
+            table_identifier=table_identifier,
+            delta_load_options=delta_options_dict,
+            runtime_info=context.runtime_info or {},
+        )
+
         table_reader = CatalogReader()
-        df = table_reader.read(
-
+        df = table_reader.read(
+            table_identifier=table_identifier,
+            options=options,
+            delta_load_options=delta_load_options,
+        )
+        return context.from_existing(data=df, runtime_info=runtime_info)
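
The new branching accepts `delta_load_options` either as a plain mapping (typical for YAML-driven pipelines) or as a `DeltaLoadOptions` model, and keeps a dict copy for the runtime info. A standalone sketch of that normalization, using a stand-in pydantic model because the real `DeltaLoadOptions` schema is not part of this diff (field names follow the YAML example above):

```python
from typing import Any

from pydantic import BaseModel


class DummyOptions(BaseModel):
    """Stand-in for DeltaLoadOptions; the real model lives in cloe_nessy.integration.delta_loader."""

    strategy: str
    delta_load_identifier: str
    strategy_options: dict[str, Any] = {}
    enable_full_load: bool = False


def normalize(options: dict[str, Any] | DummyOptions | None) -> tuple[DummyOptions | None, dict[str, Any]]:
    """Return (model_or_None, dict_copy), mirroring the action's branching above."""
    if isinstance(options, dict) or options is None:
        options_dict = options or {}
        model = DummyOptions(**options_dict) if options_dict else None
    else:
        options_dict = options.model_dump()
        model = options
    return model, options_dict


model, as_dict = normalize(
    {"strategy": "CDF", "delta_load_identifier": "my_delta_load_id", "enable_full_load": True}
)
```
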
cloe_nessy/pipeline/actions/read_files.py
CHANGED

@@ -1,8 +1,10 @@
 from typing import Any
 
+from ...integration.delta_loader import DeltaLoadOptions
 from ...integration.reader import FileReader
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
+from ..utils import set_delta_load_info
 
 
 class ReadFilesAction(PipelineAction):
@@ -55,6 +57,24 @@ class ReadFilesAction(PipelineAction):
         Use the `extension` option to specify the extension of the files
         to read. Additionally, use the `spark_format` option to specify
         the format with which to read the files.
+
+    === "Read Delta Lake table with delta loading"
+        ```yaml
+        Read Delta Files:
+            action: READ_FILES
+            options:
+                location: /path/to/delta/table
+                spark_format: delta
+                delta_load_options:
+                    strategy: CDF
+                    delta_load_identifier: my_delta_files_load
+                    strategy_options:
+                        deduplication_columns: ["id"]
+                    enable_full_load: false
+        ```
+        !!! note "Delta Loading for Files"
+            Use `delta_load_options` when reading Delta Lake tables to enable
+            incremental loading. This works with both CDF and timestamp strategies.
     """
 
     name: str = "READ_FILES"
@@ -70,6 +90,7 @@ class ReadFilesAction(PipelineAction):
         schema: str | None = None,
         add_metadata_column: bool = True,
         options: dict[str, str] | None = None,
+        delta_load_options: dict[Any, Any] | DeltaLoadOptions | None = None,
         **_: Any,
     ) -> PipelineContext:
         """Reads files from a specified location.
@@ -87,6 +108,8 @@
             add_metadata_column: Whether to include the `__metadata` column with
                 file metadata in the DataFrame.
             options: Additional options passed to the reader.
+            delta_load_options: Options for delta loading, if applicable. When provided
+                for Delta format files, enables incremental loading using delta loader strategies.
 
         Raises:
             ValueError: If neither `extension` nor `spark_format` are provided, or if
@@ -105,6 +128,25 @@
         if (metadata := context.table_metadata) and schema is None:
             schema = metadata.schema
 
+        # Convert dict to DeltaLoadOptions if needed
+        if isinstance(delta_load_options, dict):
+            delta_load_options = DeltaLoadOptions(**delta_load_options)
+
+        # Set up runtime info for delta loading
+        runtime_info = context.runtime_info or {}
+        if delta_load_options:
+            # Convert DeltaLoadOptions to dict for runtime info storage
+            delta_options_dict = (
+                delta_load_options.model_dump()
+                if isinstance(delta_load_options, DeltaLoadOptions)
+                else delta_load_options
+            )
+            runtime_info = set_delta_load_info(
+                table_identifier=location,  # Use location as identifier for file-based delta loading
+                delta_load_options=delta_options_dict,
+                runtime_info=runtime_info,
+            )
+
         file_reader = FileReader()
         df = file_reader.read(
             location=location,
@@ -114,11 +156,11 @@
             search_subdirs=search_subdirs,
             options=options,
             add_metadata_column=add_metadata_column,
+            delta_load_options=delta_load_options,
         )
 
-
-
-        if add_metadata_column:
+        # Only process metadata column if it exists and wasn't using delta loading
+        if add_metadata_column and "__metadata" in df.columns:
             read_files_list = [x.file_path for x in df.select("__metadata.file_path").drop_duplicates().collect()]
             if runtime_info is None:
                 runtime_info = {"read_files": read_files_list}
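
One behavioural detail in the hunk above: the `read_files` bookkeeping now runs only when the nested `__metadata` struct is actually present, since a delta load can return a DataFrame without per-file metadata. A small standalone sketch of that guard (the helper name is illustrative):

```python
from pyspark.sql import DataFrame


def collect_read_files(df: DataFrame, add_metadata_column: bool) -> list[str]:
    """Return distinct source file paths, but only if the __metadata column exists."""
    if add_metadata_column and "__metadata" in df.columns:
        return [row.file_path for row in df.select("__metadata.file_path").drop_duplicates().collect()]
    return []
```
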
cloe_nessy/pipeline/actions/transform_convert_timestamp.py
ADDED

@@ -0,0 +1,97 @@
+from typing import Any
+
+from pyspark.errors.exceptions.connect import IllegalArgumentException
+from pyspark.sql import functions as F
+from pyspark.sql.utils import AnalysisException
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformConvertTimestampAction(PipelineAction):
+    """This action performs timestamp based conversions.
+
+    Example:
+        ```yaml
+        Convert Timestamp:
+            action: TRANSFORM_CONVERT_TIMESTAMP
+            options:
+                columns:
+                    - date
+                    - creation_timestamp
+                    - current_ts
+                source_format: unixtime_ms
+                target_format: timestamp
+        ```
+    """
+
+    name: str = "TRANSFORM_CONVERT_TIMESTAMP"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        columns: list[str] | str | None = None,
+        source_format: str = "",
+        target_format: str = "",
+        **_: Any,
+    ) -> PipelineContext:
+        """Converts column(s) from a given source format to a new format.
+
+        Args:
+            context: Context in which this Action is executed.
+            columns: A column name or a list of column names that should be converted.
+            source_format: Initial format type of the column.
+            target_format: Desired format type of the column.
+                This also supports passing a format string like `yyyy-MM-dd HH:mm:ss`.
+
+        Raises:
+            ValueError: If no column, source_format or target_format are provided.
+            ValueError: If source_format or target_format are not supported.
+
+        Returns:
+            PipelineContext: Context after the execution of this Action.
+        """
+        if not columns:
+            raise ValueError("No column names provided.")
+        if not source_format:
+            raise ValueError("No source_format provided.")
+        if not target_format:
+            raise ValueError("No target_format provided.")
+        if context.data is None:
+            raise ValueError("Context DataFrame is required.")
+        df = context.data
+
+        columns = [columns] if isinstance(columns, str) else columns
+
+        match source_format:
+            # convert always to timestamp first
+            case "string" | "date" | "unixtime":
+                for column in columns:
+                    df = df.withColumn(column, F.to_timestamp(F.col(column)))
+            case "unixtime_ms":
+                for column in columns:
+                    df = df.withColumn(column, F.to_timestamp(F.col(column) / 1000))
+            case "timestamp":
+                pass
+            case _:
+                raise ValueError(f"Unknown source_format {source_format}")
+
+        match target_format:
+            # convert from timestamp to desired output type and format
+            case "timestamp":
+                pass
+            case "unixtime":
+                for column in columns:
+                    df = df.withColumn(column, F.to_unix_timestamp(F.col(column)))
+            case "date":
+                for column in columns:
+                    df = df.withColumn(column, F.to_date(F.col(column)))
+            case _:
+                try:
+                    for column in columns:
+                        df = df.withColumn(column, F.date_format(F.col(column), target_format))
+                except (IllegalArgumentException, AnalysisException) as e:
+                    raise ValueError(f"Invalid target_format {target_format}") from e
+
+        return context.from_existing(data=df)
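
To see what the `source_format`/`target_format` branches do in practice, here is a minimal local PySpark run of the `unixtime_ms` to `timestamp` path, a sketch with made-up data that assumes a local Spark session:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").appName("convert-ts-demo").getOrCreate()
df = spark.createDataFrame([(1_700_000_000_000,)], ["creation_timestamp"])

# unixtime_ms holds epoch milliseconds, so divide by 1000 before converting to a timestamp
df = df.withColumn("creation_timestamp", F.to_timestamp(F.col("creation_timestamp") / 1000))
df.show(truncate=False)  # 2023-11-14 22:13:20 when the session timezone is UTC
```
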
cloe_nessy/pipeline/actions/transform_deduplication.py
CHANGED

@@ -1,11 +1,10 @@
-import random
-import string
 from typing import Any
 
 import pyspark.sql.functions as F
 import pyspark.sql.types as T
 from pyspark.sql import Window
 
+from ...utils.column_names import generate_unique_column_name
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
@@ -105,18 +104,14 @@ class TransformDeduplication(PipelineAction):
         else:
             order_by_list = [F.col(col_name).asc() for col_name in order_by_columns]
 
-
-
-
-
-
-            return "".join(random.choice(string.ascii_uppercase) for _ in range(length))
+        window_specification = (
+            Window.partitionBy(key_columns)
+            .orderBy(order_by_list)
+            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
+        )
 
-        row_number_col_name =
-        while row_number_col_name in context.data.columns:
-            row_number_col_name = generate_random_string(20)
+        row_number_col_name = generate_unique_column_name(existing_columns=set(context.data.columns), prefix="row_num")
 
-        # drop the duplicates
         df = (
             context.data.withColumn(row_number_col_name, F.row_number().over(window_specification))
             .filter(F.col(row_number_col_name) == 1)
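
The refactor keeps the same deduplication mechanic: number the rows per key inside a window and keep the first one, using a helper column that cannot collide with existing names. A self-contained sketch of that pattern on toy data (column names invented; the uuid suffix mirrors `generate_unique_column_name`):

```python
import uuid

import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.master("local[1]").appName("dedup-demo").getOrCreate()
df = spark.createDataFrame(
    [(1, "2024-01-01"), (1, "2024-02-01"), (2, "2024-01-15")], ["id", "loaded_at"]
)

window_spec = (
    Window.partitionBy("id")
    .orderBy(F.col("loaded_at").desc())
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
row_num_col = f"row_num_{uuid.uuid4().hex[:8]}"  # collision-free helper column
deduped = (
    df.withColumn(row_num_col, F.row_number().over(window_spec))
    .filter(F.col(row_num_col) == 1)
    .drop(row_num_col)
)
deduped.show()  # one row per id, the latest loaded_at wins
```
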
cloe_nessy/pipeline/actions/transform_hash_columns.py
CHANGED

@@ -132,13 +132,13 @@ class TransformHashColumnsAction(PipelineAction):
         action: TRANSFORM_HASH_COLUMNS
         options:
             hash_config:
-
-
-
-
-
-
-
+                hashed_column1:
+                    columns: ["column1", "column2"]
+                    algorithm: "sha2"
+                    bits: 224
+                hashed_column2:
+                    columns: ["column1"]
+                    algorithm: "crc32"
     ```
 
     Given a DataFrame `df` with the following structure:
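
For orientation, the repaired `hash_config` example maps each output column to its source columns plus an algorithm. Below is a rough PySpark equivalent of those two entries; how the action actually combines multi-column input is not shown in this diff, so the `concat_ws("|", ...)` step is purely an assumption:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").appName("hash-demo").getOrCreate()
df = spark.createDataFrame([("a", "b")], ["column1", "column2"])

df = (
    df.withColumn("hashed_column1", F.sha2(F.concat_ws("|", "column1", "column2"), 224))  # assumed concatenation
    .withColumn("hashed_column2", F.crc32(F.col("column1").cast("binary")))
)
df.show(truncate=False)
```
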
cloe_nessy/pipeline/actions/write_catalog_table.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import Any
 
+from ...integration.delta_loader import consume_delta_load
 from ...integration.writer import CatalogWriter
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
@@ -63,6 +64,10 @@ class WriteCatalogTableAction(PipelineAction):
         if table_identifier is None:
             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
 
+        runtime_info = getattr(context, "runtime_info", None)
+        if runtime_info and runtime_info.get("is_delta_load"):
+            consume_delta_load(runtime_info)
+
         writer = CatalogWriter()
         writer.write_table(
             df=context.data,  # type: ignore
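
The same three-line hand-off appears in every writer action touched by this release (catalog table, delta append, delta merge, file write): after the write, any pending delta load recorded in `runtime_info` is consumed. A hedged sketch of that guard extracted into a helper; what `consume_delta_load` records internally lives in the new `delta_loader` package and is not visible here:

```python
from cloe_nessy.integration.delta_loader import consume_delta_load


def finalize_delta_load(runtime_info: dict | None) -> None:
    """After a successful write, acknowledge a pending delta load, if any (illustrative helper)."""
    if runtime_info and runtime_info.get("is_delta_load"):
        consume_delta_load(runtime_info)
```
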
cloe_nessy/pipeline/actions/write_delta_append.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import Any
 
+from ...integration.delta_loader import consume_delta_load
 from ...integration.writer import DeltaAppendWriter
 from ...models.adapter import UnityCatalogAdapter
 from ...pipeline import PipelineAction, PipelineContext
@@ -10,6 +11,15 @@ class WriteDeltaAppendAction(PipelineAction):
 
     The WriteDeltaAppendAction appends a Dataframe to Delta Table.
 
+    Example:
+        ```yaml
+        Write Delta Append:
+            action: WRITE_DELTA_APPEND
+            options:
+                table_identifier: my_catalog.my_schema.my_table
+                ignore_empty_df: false
+        ```
+
     Returns:
         None.
     """
@@ -66,4 +76,9 @@ class WriteDeltaAppendAction(PipelineAction):
             ignore_empty_df=ignore_empty_df,
             options=options,
         )
+
+        runtime_info = getattr(context, "runtime_info", None)
+        if runtime_info and runtime_info.get("is_delta_load"):
+            consume_delta_load(runtime_info)
+
         return context.from_existing()

cloe_nessy/pipeline/actions/write_delta_merge.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import Any
 
+from ...integration.delta_loader import consume_delta_load
 from ...integration.writer import DeltaMergeWriter
 from ...models.adapter import UnityCatalogAdapter
 from ...pipeline import PipelineAction, PipelineContext
@@ -10,6 +11,24 @@ class WriteDeltaMergeAction(PipelineAction):
 
     The MergeIntoDeltaAction merges a Dataframe to Delta Table.
 
+    Example:
+        ```yaml
+        Write Delta Merge:
+            action: WRITE_DELTA_MERGE
+            options:
+                table_identifier: my_catalog.my_schema.my_table
+                key_columns:
+                    - id
+                    - customer_id
+                cols_to_update:
+                    - name
+                    - email
+                    - updated_at
+                when_matched_update: true
+                when_not_matched_insert: true
+                use_partition_pruning: true
+        ```
+
     Returns:
         None.
     """
@@ -112,6 +131,10 @@ class WriteDeltaMergeAction(PipelineAction):
             ignore_empty_df=ignore_empty_df,
         )
 
+        runtime_info = getattr(context, "runtime_info", None)
+        if runtime_info and runtime_info.get("is_delta_load"):
+            consume_delta_load(runtime_info)
+
         if refresh_table:
             delta_merge_writer.table_manager.refresh_table(table_identifier=context.table_metadata.identifier)
 

cloe_nessy/pipeline/actions/write_file.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import Any
 
+from ...integration.delta_loader import consume_delta_load
 from ...integration.writer import FileWriter
 from ...pipeline import PipelineAction, PipelineContext
 
@@ -21,7 +22,7 @@ class WriteFileAction(PipelineAction):
             mode: "append"
             is_stream: False
             options:
-                mergeSchema:
+                mergeSchema: true
         ```
     """
 
@@ -91,4 +92,8 @@ class WriteFileAction(PipelineAction):
             options=options,
         )
 
+        runtime_info = getattr(context, "runtime_info", None)
+        if runtime_info and runtime_info.get("is_delta_load"):
+            consume_delta_load(runtime_info)
+
         return context.from_existing()

cloe_nessy/pipeline/utils/delta_load_utils.py
ADDED

@@ -0,0 +1,36 @@
+"""Utilities for managing delta load information in pipeline runtime context."""
+
+from typing import Any
+
+
+def set_delta_load_info(
+    table_identifier: str,
+    delta_load_options: dict[str, Any],
+    runtime_info: dict[str, Any],
+) -> dict[str, Any]:
+    """Update the runtime information dictionary with delta load options for a specific table.
+
+    If delta load options are provided, this function marks the runtime as a delta load and
+    stores the options under the given table identifier within the 'delta_load_options' key
+    of the runtime_info dictionary.
+
+    The method uses `setdefault("delta_load_options", {})` to ensure that the 'delta_load_options'
+    key exists in the runtime_info dictionary. If the key is not present, it initializes it with
+    an empty dictionary. This prevents overwriting existing delta load options and allows
+    multiple tables' options to be stored without losing previous entries.
+
+    Args:
+        table_identifier: The identifier for the table (can be table name or file path).
+        delta_load_options: Options specific to the delta load for the table.
+        runtime_info: The runtime information dictionary to update.
+
+    Returns:
+        The updated runtime information dictionary with delta load details.
+    """
+    if not delta_load_options:
+        return runtime_info
+
+    runtime_info["is_delta_load"] = True
+    runtime_info.setdefault("delta_load_options", {})[table_identifier] = delta_load_options
+
+    return runtime_info
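
A short usage sketch for `set_delta_load_info`, showing how calls for two different sources accumulate in one `runtime_info` dictionary (identifiers and strategy values are illustrative; the import path follows the package layout in this diff):

```python
from cloe_nessy.pipeline.utils import set_delta_load_info

runtime_info: dict = {}
runtime_info = set_delta_load_info(
    table_identifier="my_catalog.business_schema.sales_table",
    delta_load_options={"strategy": "CDF", "delta_load_identifier": "my_delta_load_id"},
    runtime_info=runtime_info,
)
runtime_info = set_delta_load_info(
    table_identifier="/path/to/delta/table",
    delta_load_options={"strategy": "TIMESTAMP"},  # illustrative strategy value
    runtime_info=runtime_info,
)

# runtime_info now holds is_delta_load=True and one entry per identifier:
# {"is_delta_load": True,
#  "delta_load_options": {
#      "my_catalog.business_schema.sales_table": {"strategy": "CDF", "delta_load_identifier": "my_delta_load_id"},
#      "/path/to/delta/table": {"strategy": "TIMESTAMP"}}}
```
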
cloe_nessy/utils/column_names.py
ADDED

@@ -0,0 +1,9 @@
+import uuid
+
+
+def generate_unique_column_name(existing_columns: set[str], prefix: str = "temp_col") -> str:
+    """Generate a unique column name that doesn't conflict with existing columns."""
+    base_name = f"{prefix}_{uuid.uuid4().hex[:8]}"
+    while base_name in existing_columns:
+        base_name = f"{prefix}_{uuid.uuid4().hex[:8]}"
+    return base_name
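
Usage is straightforward; the helper appends an eight-character uuid suffix and retries on the unlikely collision:

```python
from cloe_nessy.utils.column_names import generate_unique_column_name

existing = {"id", "name", "row_num_1a2b3c4d"}
col_name = generate_unique_column_name(existing_columns=existing, prefix="row_num")
assert col_name.startswith("row_num_") and col_name not in existing
```
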
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cloe-nessy
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.17.0
|
|
4
4
|
Summary: Your friendly datalake monster.
|
|
5
5
|
Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
|
|
6
6
|
License: MIT
|
|
@@ -58,12 +58,12 @@ Extract-Transform-Load (ETL) Workflow.
|
|
|
58
58
|
|
|
59
59
|
When you are contributing, please refer to our Contribution Guide in the *nessy*
|
|
60
60
|
Docs
|
|
61
|
-
[here](https://
|
|
61
|
+
[here](https://yellow-mud-0b9177e03.2.azurestaticapps.net/tool_docs/nessy/Developer-Guide/)!
|
|
62
62
|
|
|
63
63
|
## Usage
|
|
64
64
|
|
|
65
65
|
Please find the User Guide
|
|
66
|
-
[here](https://
|
|
66
|
+
[here](https://yellow-mud-0b9177e03.2.azurestaticapps.net/tool_docs/nessy/User-Guide/)!
|
|
67
67
|
|
|
68
68
|
## Contact
|
|
69
69
|
|