cloe-nessy 0.3.17.0__py3-none-any.whl → 0.3.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/clients/api_client/__init__.py +10 -1
- cloe_nessy/clients/api_client/api_client.py +19 -8
- cloe_nessy/clients/api_client/api_response.py +7 -4
- cloe_nessy/clients/api_client/pagination_config.py +84 -0
- cloe_nessy/clients/api_client/pagination_strategy.py +500 -0
- cloe_nessy/integration/delta_loader/delta_loader.py +1 -1
- cloe_nessy/integration/reader/__init__.py +2 -2
- cloe_nessy/integration/reader/api_reader.py +463 -72
- cloe_nessy/integration/reader/catalog_reader.py +49 -10
- cloe_nessy/integration/reader/excel_reader.py +3 -3
- cloe_nessy/integration/reader/file_reader.py +3 -1
- cloe_nessy/integration/reader/reader.py +1 -1
- cloe_nessy/integration/writer/catalog_writer.py +64 -2
- cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +5 -1
- cloe_nessy/models/column.py +3 -2
- cloe_nessy/models/schema.py +1 -0
- cloe_nessy/models/templates/create_table.sql.j2 +22 -0
- cloe_nessy/object_manager/table_manager.py +29 -7
- cloe_nessy/pipeline/actions/__init__.py +1 -1
- cloe_nessy/pipeline/actions/read_api.py +272 -75
- cloe_nessy/pipeline/actions/read_catalog_table.py +73 -10
- cloe_nessy/pipeline/actions/read_excel.py +1 -1
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +61 -33
- cloe_nessy/pipeline/actions/transform_decode.py +2 -1
- cloe_nessy/pipeline/actions/transform_join.py +98 -24
- cloe_nessy/pipeline/actions/transform_union.py +2 -2
- cloe_nessy/pipeline/actions/write_catalog_table.py +66 -21
- cloe_nessy/pipeline/actions/write_delta_merge.py +1 -0
- cloe_nessy/pipeline/pipeline_config.py +2 -0
- cloe_nessy/pipeline/pipeline_context.py +1 -1
- cloe_nessy/pipeline/pipeline_parsing_service.py +104 -39
- cloe_nessy/pipeline/pipeline_step.py +2 -0
- cloe_nessy/session/__init__.py +2 -1
- cloe_nessy/session/pyspark_compat.py +15 -0
- cloe_nessy/session/session_manager.py +1 -1
- {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/METADATA +19 -19
- {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/RECORD +38 -36
- {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/WHEEL +1 -2
- cloe_nessy-0.3.17.0.dist-info/top_level.txt +0 -1

cloe_nessy/integration/reader/excel_reader.py
CHANGED

@@ -3,7 +3,8 @@ from typing import Any
 
 import pandas as pd
 import pyspark.sql.functions as F
-
+
+from cloe_nessy.session import DataFrame
 
 from .reader import BaseReader
 
@@ -27,7 +28,6 @@ class ExcelDataFrameReader(BaseReader):
     def read(
         self,
         location: str,
-        *,
        sheet_name: str | int | list = 0,
        header: int | list[int] = 0,
        index_col: int | list[int] | None = None,
@@ -43,7 +43,7 @@
         options: dict | None = None,
         load_as_strings: bool = False,
         add_metadata_column: bool = False,
-        **
+        **_: Any,
     ) -> DataFrame:
         """Reads Excel file on specified location and returns DataFrame.
 
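The signature change above replaces the reader's previous keyword catch-all (shown truncated in this diff) with `**_: Any`, so extra keyword arguments passed by generic pipeline code are accepted and silently ignored. A minimal sketch of the pattern, not the actual ExcelDataFrameReader; the function name and the storage path are placeholders:

from typing import Any


def read(location: str, sheet_name: str | int = 0, **_: Any) -> None:
    """Illustrative stand-in: unexpected keyword arguments are captured by **_ and discarded."""
    print(f"reading {location!r}, sheet {sheet_name!r}")


# Options meant for other readers no longer raise TypeError here:
read("abfss://container@account.dfs.core.windows.net/data.xlsx", sheet_name="Sheet1", schema=None, options={})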

cloe_nessy/integration/reader/file_reader.py
CHANGED

@@ -1,10 +1,12 @@
 from typing import Any
 
 import pyspark.sql.functions as F
-from pyspark.sql import
+from pyspark.sql import DataFrameReader
 from pyspark.sql.streaming import DataStreamReader
 from pyspark.sql.types import StructType
 
+from cloe_nessy.session import DataFrame
+
 from ...file_utilities import get_file_paths
 from ..delta_loader.delta_load_options import DeltaLoadOptions
 from ..delta_loader.delta_loader_factory import DeltaLoaderFactory

cloe_nessy/integration/writer/catalog_writer.py
CHANGED

@@ -1,11 +1,11 @@
-from
+from cloe_nessy.session import DataFrame
 
 
 class CatalogWriter:
     """A writer for Catalog tables."""
 
     @staticmethod
-    def
+    def write(
         df: DataFrame | None,
         table_identifier: str | None,
         partition_by: str | list[str] | None = None,
@@ -46,3 +46,65 @@ class CatalogWriter:
         if options is None:
             options = {}
         df.write.saveAsTable(table_identifier, mode=mode, partitionBy=partition_by, **options)
+
+    @staticmethod
+    def write_stream(
+        df: DataFrame | None,
+        table_identifier: str | None,
+        checkpoint_location: str | None = None,
+        trigger_dict: dict | None = None,
+        options: dict[str, str] | None = None,
+        mode: str = "append",
+        await_termination: bool = False,
+    ) -> None:
+        """Write a streaming DataFrame to a Unity Catalog table.
+
+        Args:
+            df: The streaming DataFrame to write.
+            table_identifier: The table identifier in the Unity Catalog in the
+                format 'catalog.schema.table'.
+            checkpoint_location: Location for checkpointing. Required for stream recovery.
+            trigger_dict: A dictionary specifying the trigger configuration for the streaming query.
+                Supported keys include:
+                - "processingTime": Specifies a time interval (e.g., "10 seconds") for micro-batch processing.
+                - "once": Processes all available data once and then stops.
+                - "continuous": Specifies a time interval (e.g., "1 second") for continuous processing.
+                - "availableNow": Processes all available data immediately and then stops.
+                If nothing is provided, the default is {"availableNow": True}.
+            options: PySpark options for the DataFrame streaming write operation.
+            mode: The write mode. For streaming, typically "append".
+            await_termination: If True, the function will wait for the streaming
+                query to finish before returning.
+
+        Raises:
+            ValueError: If the mode is not supported for streaming operations.
+            ValueError: If the table_identifier is not a string or not in the format 'catalog.schema.table'.
+            ValueError: If the DataFrame is None.
+            ValueError: If checkpoint_location is not provided.
+        """
+        if mode not in ("append", "complete", "update"):
+            raise ValueError("mode must be one of append, complete, update for streaming operations")
+        if not table_identifier:
+            raise ValueError("table_identifier is required")
+        elif not isinstance(table_identifier, str):
+            raise ValueError("table_identifier must be a string")
+        elif len(table_identifier.split(".")) != 3:
+            raise ValueError("table_identifier must be in the format 'catalog.schema.table'")
+        if not df:
+            raise ValueError("df is required, but was None.")
+        if not checkpoint_location:
+            raise ValueError("checkpoint_location is required for streaming operations")
+
+        if options is None:
+            options = {}
+        if trigger_dict is None:
+            trigger_dict = {"availableNow": True}
+
+        stream_writer = df.writeStream.format("delta").outputMode(mode)
+        stream_writer.options(**options).option("checkpointLocation", checkpoint_location)
+        stream_writer.trigger(**trigger_dict)
+
+        query = stream_writer.toTable(table_identifier)
+
+        if await_termination:
+            query.awaitTermination()
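For orientation, a minimal usage sketch of the new write_stream under stated assumptions: the import path mirrors the module location shown above (the package may re-export it elsewhere), the streaming source uses Spark's built-in "rate" format purely for illustration, and the catalog, schema, table, and checkpoint names are placeholders.

from pyspark.sql import SparkSession

from cloe_nessy.integration.writer.catalog_writer import CatalogWriter

spark = SparkSession.builder.getOrCreate()

# Any streaming DataFrame works; the rate source is only used here for illustration.
events_df = spark.readStream.format("rate").load()

CatalogWriter.write_stream(
    df=events_df,
    table_identifier="my_catalog.my_schema.events",  # placeholder 'catalog.schema.table'
    checkpoint_location="/tmp/checkpoints/events",  # placeholder; required for stream recovery
    trigger_dict={"availableNow": True},  # same as the default when omitted
    mode="append",
    await_termination=True,
)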

cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
CHANGED

@@ -196,7 +196,11 @@ class DeltaMergeWriter(BaseDeltaWriter):
 
         config = DeltaMergeConfig(dataframe_columns=data_frame.columns, **kwargs)
 
-        delta_table = self.table_manager.get_delta_table(
+        delta_table = self.table_manager.get_delta_table(
+            table=table,
+            location=storage_path if not table else None,
+            spark=data_frame.sparkSession,
+        )
 
         match_conditions = self._build_match_conditions(data_frame, config)
 
cloe_nessy/models/column.py
CHANGED

@@ -5,6 +5,7 @@ from pydantic import BaseModel, Field, field_validator, model_validator
 
 COLUMN_DATA_TYPE_LIST = {
     "string",
+    "decimal",
     "integer",
     "int",
     "smallint",
@@ -31,7 +32,7 @@ class Column(BaseModel):
     nullable: bool = True
     default_value: Any = None
     generated: str | None = None
-
+    business_properties: dict[str, Any] = Field(default_factory=dict)
     comment: str | None = None
 
     @field_validator("data_type", mode="before")
@@ -43,7 +44,7 @@ class Column(BaseModel):
         """
         val = raw.lower()
         base_data_types = re.findall(r"\b[a-z]+\b", val)
-        forbidden_characters = re.findall(r"[^a-
+        forbidden_characters = re.findall(r"[^a-z0-9\(\)\<\>, ]+", val)
 
         if forbidden_characters:
             raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
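The tightened validator now allows digits alongside parentheses, angle brackets, commas, and spaces, which is what lets parameterised types such as decimal(10,2) (with "decimal" newly added to COLUMN_DATA_TYPE_LIST) pass. A standalone sketch of the same check, not the Column validator itself; the function name is illustrative:

import re


def check_data_type(raw: str) -> str:
    """Reject data type strings containing characters outside the allowed set."""
    val = raw.lower()
    forbidden_characters = re.findall(r"[^a-z0-9\(\)\<\>, ]+", val)
    if forbidden_characters:
        raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
    return val


print(check_data_type("decimal(10,2)"))     # digits are now allowed
print(check_data_type("map<string, int>"))  # angle brackets and commas remain allowed
# check_data_type("decimal(10;2)")          # would raise ValueError because of the ';'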
cloe_nessy/models/schema.py
CHANGED

@@ -43,6 +43,7 @@ class Schema(ReadInstancesMixin):
             raise FileNotFoundError("Schema file not found.")
 
         schema, schema_errors = super().read_instance_from_file(processed_instance_path)
+        table_errors: list[ValidationErrorType] = []
         if schema:
             schema.storage_path = "" if not schema.storage_path else schema.storage_path
             tables, table_errors = Table.read_instances_from_directory(

cloe_nessy/models/templates/create_table.sql.j2
CHANGED

@@ -13,6 +13,14 @@ USING delta
 {% if table.storage_path %}
 LOCATION '{{ table.storage_path }}'
 {% endif %}
+{% if table.properties %}
+TBLPROPERTIES (
+{%- for key, value in table.properties.items() %}
+{%- if not loop.first %}, {% endif -%}
+'{{key}}' = '{{value}}'
+{%- endfor -%}
+)
+{% endif %}
 {% if table.partition_by -%}
 {%- if table.liquid_clustering -%} CLUSTER {%- else -%} PARTITIONED {%- endif %} BY (
 {%- for column in table.partition_by -%}
@@ -34,3 +42,17 @@ ALTER TABLE {{ table.escaped_identifier }} ADD CONSTRAINT {{constraint.name}} CH
 {%- if table.comment %}
 COMMENT ON TABLE {{ table.escaped_identifier }} IS '{{ table.comment }}';
 {%- endif %}
+{# Tags do not yet work in Databricks
+{%- if table.business_properties %}
+{%- for tag_key, tag_value in table.business_properties.items() %}
+SET TAG ON TABLE {{ table.escaped_identifier }} `{{ tag_key }}`{% if tag_value %} = `{{ tag_value }}`{% endif %};
+{%- endfor %}
+{%- endif %}
+
+{%- for column in table.columns %}
+{%- if column.business_properties %}
+{%- for tag_key, tag_value in column.business_properties.items() %}
+SET TAG ON COLUMN {{ table.escaped_identifier }}.`{{ column.name }}` `{{ tag_key }}`{% if tag_value %} = `{{ tag_value }}`{% endif %};
+{%- endfor %}
+{%- endif %}
+{%- endfor %} #}
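To see what the new TBLPROPERTIES block emits, here is a rendering sketch of just that fragment using plain jinja2; the stand-in table dict and the property values are illustrative, not the package's models. The SET TAG statements for business_properties above are added only inside a Jinja comment, so they are not rendered yet.

from jinja2 import Template

# Stand-in for the TBLPROPERTIES fragment added to create_table.sql.j2.
snippet = Template(
    "{% if table.properties %}"
    "TBLPROPERTIES ("
    "{%- for key, value in table.properties.items() %}"
    "{%- if not loop.first %}, {% endif -%}"
    "'{{key}}' = '{{value}}'"
    "{%- endfor -%}"
    ")"
    "{% endif %}"
)

table = {"properties": {"delta.enableChangeDataFeed": "true", "quality": "silver"}}
print(snippet.render(table=table))
# TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true', 'quality' = 'silver')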

cloe_nessy/object_manager/table_manager.py
CHANGED

@@ -110,7 +110,7 @@ class TableManager(LoggerMixin):
         self._spark.sql(f"USE CATALOG {table.catalog};")
         self._spark.sql(f"USE SCHEMA {table.schema};")
         for statement in table.get_create_statement(replace=replace).split(";"):
-            if statement and statement
+            if statement and statement.strip():
                 self._spark.sql(statement)
 
     def drop_table(
@@ -186,6 +186,9 @@ class TableManager(LoggerMixin):
     def get_delta_table(self, table: Table | None = None, location: str | None = None, spark=None) -> DeltaTable:
         """Get the DeltaTable object from the Table objects location or a location string.
 
+        For managed tables, uses the table identifier to access the DeltaTable.
+        For external tables or when a location is provided, uses the storage path.
+
         Args:
             table: A Table object representing the Delta table.
             location: A string representing the table location.
@@ -195,17 +198,35 @@
             The DeltaTable object corresponding to the given Table object or location string.
 
         Raises:
-            ValueError: If neither table nor location is provided
+            ValueError: If neither table nor location is provided.
         """
-        if
+        if table is None and location is None:
             raise ValueError(
-                f"Either table or location must be provided
+                f"Either table or location must be provided. Table: {table}, location: {location}",
+            )
+
+        spark_session = spark or self._spark
+
+        if table is not None and location is not None:
+            self._console_logger.info(
+                f"Both table ({table.identifier}) and location ({location}) provided. Using table object as priority."
             )
 
         if table is not None:
-
+            if table.is_external is False:
+                self._console_logger.info(f"Getting DeltaTable object for managed table: {table.identifier}")
+                return DeltaTable.forName(spark_session, table.identifier)
+
+            table_location = str(table.storage_path)
+            self._console_logger.info(f"Getting DeltaTable object for external table location: {table_location}")
+            return DeltaTable.forPath(spark_session, table_location)
+
+        self._console_logger.info(f"No table object provided, using location: {location}")
+        if location is None:
+            self._console_logger.error("Location is None - this should not happen!")
+            raise ValueError("Location cannot be None when no table object is provided")
         self._console_logger.info(f"Getting DeltaTable object for location: {location}")
-        return DeltaTable.forPath(
+        return DeltaTable.forPath(spark_session, str(location))
 
     def table_exists(self, table: Table | None = None, table_identifier: str | None = None) -> bool:
         """Checks if a table exists in the catalog.
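The resolution order introduced above (managed tables by catalog identifier, external tables by storage path, otherwise the explicit location string) can be summarised in a standalone sketch. TableStub and resolve_delta_table are simplified stand-ins for the package's Table model and TableManager.get_delta_table, with logging and the table/location priority message omitted.

from dataclasses import dataclass

from delta.tables import DeltaTable
from pyspark.sql import SparkSession


@dataclass
class TableStub:
    """Minimal stand-in for the package's Table model (illustrative only)."""

    identifier: str
    storage_path: str | None = None
    is_external: bool = False


def resolve_delta_table(
    spark: SparkSession, table: TableStub | None = None, location: str | None = None
) -> DeltaTable:
    if table is None and location is None:
        raise ValueError("Either table or location must be provided.")
    if table is not None:
        if table.is_external is False:
            # Managed table: resolve through the catalog identifier.
            return DeltaTable.forName(spark, table.identifier)
        # External table: resolve through its storage path.
        return DeltaTable.forPath(spark, str(table.storage_path))
    # No table object: fall back to the explicit location string.
    return DeltaTable.forPath(spark, str(location))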
@@ -235,9 +256,10 @@ class TableManager(LoggerMixin):
             raise ValueError("Invalid table identifier format. Expected 'catalog.schema.table'.")
 
         query_result = self._spark.sql(
+            # Using both upper and lower case to ensure compatibility with case changes in Databricks
             f"""
             SELECT 1 FROM {catalog}.information_schema.tables
-            WHERE table_name
+            WHERE table_name in ('{table_name}', '{table_name.lower()}')
             AND table_schema = '{schema}'
             LIMIT 1""",
         )
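The existence check now probes information_schema with both the original and the lower-cased table name to tolerate case changes in Databricks. A small sketch of the query it builds, with placeholder identifiers (not the package's method):

def exists_query(catalog: str, schema: str, table_name: str) -> str:
    """Build the information_schema probe used by the existence check (illustrative)."""
    return f"""
    SELECT 1 FROM {catalog}.information_schema.tables
    WHERE table_name in ('{table_name}', '{table_name.lower()}')
    AND table_schema = '{schema}'
    LIMIT 1"""


print(exists_query("my_catalog", "my_schema", "MyTable"))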

cloe_nessy/pipeline/actions/__init__.py
CHANGED

@@ -33,7 +33,7 @@ pipeline_actions = {cls.name: cls for cls in PipelineAction.__subclasses__()}
 # Register all subclasses dynamically as enum using their "name" attribute as
 # key. We need to do this here, because otherwise we don't get all subclasses
 # from a relative import of PipelineAction
-PipelineActionType = Enum("PipelineActionType", pipeline_actions)  # type: ignore
+PipelineActionType = Enum("PipelineActionType", pipeline_actions)  # type: ignore[misc]
 
 __all__ = [
     "ReadAPIAction",