PyPI - datazone-sdk - Versions diffs - 6.0.1.dev9__tar.gz → 6.0.1.dev11__tar.gz - Mend

datazone-sdk 6.0.1.dev9.tar.gz → 6.0.1.dev11.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

{datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datazone-sdk
-Version: 6.0.1.dev9
+Version: 6.0.1.dev11
 Summary: Database and Delta storage client library for working with Delta Lake tables
 Author: Team Enigma
 Author-email: enigma@energinet.dk

{datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/table.py RENAMED Viewed

@@ -12,12 +12,12 @@ from .slicing import HyperSlice
 def _sql_literal(value: Any) -> str:
     """Render a Python value as a *type-correct* SQL literal for Delta predicates.
-    Earlier versions quoted every value as a string (``f"{col} {op} '{val}'"``),
-    which worked because the old delta-rs/datafusion implicitly cast the string
-    literal to the column type. The upgraded datafusion type-checks predicates
-    strictly and rejects comparing a non-string column to a string literal
-    (e.g. ``Timestamp(us, "UTC") <= Utf8View``). So numbers are left unquoted,
-    booleans become ``TRUE``/``FALSE`` and ``None`` becomes ``NULL``; only real
+    datafusion type-checks predicates and does not always coerce a quoted
+    string literal to the column type (in particular not inside ``IN (...)``,
+    where a string compared to a timestamp column raises
+    ``Timestamp(us, "UTC") <= Utf8View``). Rendering each value with its real
+    type avoids relying on that coercion: numbers are left unquoted, booleans
+    become ``TRUE``/``FALSE`` and ``None`` becomes ``NULL``; only real
     strings/dates are quoted (and escaped).
     """
     if isinstance(value, dt.datetime):
@@ -38,8 +38,16 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
     """Convert a hyper slice (DNF expression) to a Delta predicate SQL string.
     Needed because delta-rs overwrite/delete operations accept a SQL predicate
-    string, not the tuple filter format the SDK uses everywhere else. ``in`` is
-    expanded to ``OR`` of equalities since the predicate dialect has no ``IN``.
+    string, not the tuple filter format the SDK uses everywhere else.
+    ``in`` is rendered as an OR-of-equalities rather than SQL ``IN (...)`` on
+    purpose: datafusion coerces a quoted string literal to the column type for
+    the binary ``=`` operator, but NOT inside ``IN (...)``. So a timestamp
+    filter written as ``time_utc IN ('2025-01-01T...')`` raises
+    ``Invalid comparison operation: Timestamp(us, "UTC") <= Utf8View``, while
+    ``(time_utc = '2025-01-01T...')`` is coerced and works. (For columns whose
+    type already matches the literal, e.g. ints, ``IN`` would be fine -- but the
+    OR form is correct for every column type.)
     """
     if len(dnf) == 0:
         return "1=1"
@@ -61,28 +69,14 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
     return " AND ".join(sql_parts)
-def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
-    """Convert a single tuple filter to a Polars expression.
+def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
+    """Return ``None`` for an empty filter list.
-    Needed so the SDK's ``(column, op, value)`` filters can be pushed down as
-    predicates to the native Polars Delta reader in ``Table.read``, instead of
-    PyArrow filters, whose compute kernels fail on delta-rs ``string_view``
-    columns (``ArrowNotImplementedError``).
+    ``DeltaTable.to_pyarrow_table`` treats ``None`` as "no filter", so callers
+    must pass ``None`` rather than an empty list when there is nothing to filter
+    on.
     """
-    col, op, val = filter_
-    if op == "=":
-        return pl.col(col) == val
-    if op == "in":
-        return pl.col(col).is_in(val)
-    if op == ">=":
-        return pl.col(col) >= val
-    if op == "<=":
-        return pl.col(col) <= val
-    if op == ">":
-        return pl.col(col) > val
-    if op == "<":
-        return pl.col(col) < val
-    raise ValueError(f"Unsupported operation: {op}")
+    return filters if len(filters) > 0 else None
 class Table:
@@ -138,19 +132,14 @@ class Table:
     ) -> pl.DataFrame:
         """Read from Delta table.
-        All filters are pushed down to the native Delta reader. Partition
-        filters (including filters derived from generated partition columns)
-        result in partition pruning, while non-partition filters are pushed
-        down to the Parquet reader as predicates.
+        Filters are split into two groups so each is handled by the engine
+        best suited for it:
-        We use the native Polars Delta reader (`pl.scan_delta`) rather than
-        `DeltaTable.to_pyarrow_table(filters=...)` on purpose. The PyArrow
-        filter path evaluates predicates with PyArrow compute kernels, which
-        do not implement comparisons for the `string_view` type returned by
-        delta-rs. That mismatch raises errors such as
-        `ArrowNotImplementedError: Function 'greater_equal' has no kernel
-        matching input types (string_view, string_view)`. The native reader
-        evaluates predicates in its own engine and avoids this entirely.
+        * Partition filters (including filters derived from generated partition
+          columns) go to ``partitions=`` so delta-rs prunes whole partitions
+          before any data is read.
+        * Non-partition filters go to ``filters=`` and are pushed down to the
+          Parquet reader.
         Args:
             hyper_slice (HyperSlice): Hyper slice used to filter data.
@@ -159,29 +148,26 @@ class Table:
         if hyper_slice is None:
             hyper_slice = []
-        # Generated filters add predicates on generated partition columns
-        # (e.g. `date_utc` derived from `time_utc`) so the native reader can
-        # prune partitions even when the caller only filters the base column.
-        pushdown_slice = self.schema().add_generated_filters(hyper_slice)
-        # `credential_provider=None` makes Polars pass `storage_options` straight
-        # to the native object store, exactly like `dl.DeltaTable(...)` does. We
-        # avoid the default `credential_provider="auto"`, which would build a
-        # separate Polars-managed Azure credential and could diverge from the
-        # managed-identity / Azure CLI auth used everywhere else in this class.
-        lazy_frame = pl.scan_delta(
-            self.url,
-            storage_options=self.storage_options,
-            credential_provider=None,
-        )
+        # add generated filters to hyperslice
+        hyper_slice = self.schema().add_generated_filters(hyper_slice)
+        delta_table = self.delta_table
+        partition_cols = delta_table.metadata().partition_columns
-        for filter_ in pushdown_slice:
-            lazy_frame = lazy_frame.filter(_filter_to_polars_expr(filter_))
+        if len(hyper_slice) == 0:
+            file_filters = None
+            partition_filters = None
+        else:
+            file_filters = hyper_slice
+            partition_filters = [f for f in hyper_slice if f[0] in partition_cols]
-        if columns is not None:
-            lazy_frame = lazy_frame.select(columns)
+        pyarrow_table_existing_data = delta_table.to_pyarrow_table(
+            columns=columns,
+            partitions=partition_filters,
+            filters=file_filters,
+        )
-        return lazy_frame.collect()
+        return pl.from_arrow(pyarrow_table_existing_data)
     def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
         """Convert Polars dataframe to pyarrow table with casted schema.

{datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "datazone-sdk"
-version = "6.0.1.dev9"
+version = "6.0.1.dev11"
 description = "Database and Delta storage client library for working with Delta Lake tables"
 authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
 requires-python = ">=3.10"