datazone-sdk 6.0.1.dev9__tar.gz → 6.0.1.dev11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/PKG-INFO +1 -1
  2. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/table.py +46 -60
  3. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/pyproject.toml +1 -1
  4. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/README.md +0 -0
  5. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/__init__.py +0 -0
  6. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/backtesting.py +0 -0
  7. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/caching/__init__.py +0 -0
  8. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/caching/parquet.py +0 -0
  9. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/db/__init__.py +0 -0
  10. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/db/base.py +0 -0
  11. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/db/cached.py +0 -0
  12. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/db/snapshot.py +0 -0
  13. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/db/standard.py +0 -0
  14. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/__init__.py +0 -0
  15. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/data_types.py +0 -0
  16. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/generated_columns.py +0 -0
  17. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/schema.py +0 -0
  18. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/slicing.py +0 -0
  19. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/store.py +0 -0
  20. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/testing/__init__.py +0 -0
  21. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/testing/database_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datazone-sdk
3
- Version: 6.0.1.dev9
3
+ Version: 6.0.1.dev11
4
4
  Summary: Database and Delta storage client library for working with Delta Lake tables
5
5
  Author: Team Enigma
6
6
  Author-email: enigma@energinet.dk
@@ -12,12 +12,12 @@ from .slicing import HyperSlice
12
12
  def _sql_literal(value: Any) -> str:
13
13
  """Render a Python value as a *type-correct* SQL literal for Delta predicates.
14
14
 
15
- Earlier versions quoted every value as a string (``f"{col} {op} '{val}'"``),
16
- which worked because the old delta-rs/datafusion implicitly cast the string
17
- literal to the column type. The upgraded datafusion type-checks predicates
18
- strictly and rejects comparing a non-string column to a string literal
19
- (e.g. ``Timestamp(us, "UTC") <= Utf8View``). So numbers are left unquoted,
20
- booleans become ``TRUE``/``FALSE`` and ``None`` becomes ``NULL``; only real
15
+ datafusion type-checks predicates and does not always coerce a quoted
16
+ string literal to the column type (in particular not inside ``IN (...)``,
17
+ where a string compared to a timestamp column raises
18
+ ``Timestamp(us, "UTC") <= Utf8View``). Rendering each value with its real
19
+ type avoids relying on that coercion: numbers are left unquoted, booleans
20
+ become ``TRUE``/``FALSE`` and ``None`` becomes ``NULL``; only real
21
21
  strings/dates are quoted (and escaped).
22
22
  """
23
23
  if isinstance(value, dt.datetime):
@@ -38,8 +38,16 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
38
38
  """Convert a hyper slice (DNF expression) to a Delta predicate SQL string.
39
39
 
40
40
  Needed because delta-rs overwrite/delete operations accept a SQL predicate
41
- string, not the tuple filter format the SDK uses everywhere else. ``in`` is
42
- expanded to ``OR`` of equalities since the predicate dialect has no ``IN``.
41
+ string, not the tuple filter format the SDK uses everywhere else.
42
+
43
+ ``in`` is rendered as an OR-of-equalities rather than SQL ``IN (...)`` on
44
+ purpose: datafusion coerces a quoted string literal to the column type for
45
+ the binary ``=`` operator, but NOT inside ``IN (...)``. So a timestamp
46
+ filter written as ``time_utc IN ('2025-01-01T...')`` raises
47
+ ``Invalid comparison operation: Timestamp(us, "UTC") <= Utf8View``, while
48
+ ``(time_utc = '2025-01-01T...')`` is coerced and works. (For columns whose
49
+ type already matches the literal, e.g. ints, ``IN`` would be fine -- but the
50
+ OR form is correct for every column type.)
43
51
  """
44
52
  if len(dnf) == 0:
45
53
  return "1=1"
@@ -61,28 +69,14 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
61
69
  return " AND ".join(sql_parts)
62
70
 
63
71
 
64
- def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
65
- """Convert a single tuple filter to a Polars expression.
72
+ def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
73
+ """Return ``None`` for an empty filter list.
66
74
 
67
- Needed so the SDK's ``(column, op, value)`` filters can be pushed down as
68
- predicates to the native Polars Delta reader in ``Table.read``, instead of
69
- PyArrow filters, whose compute kernels fail on delta-rs ``string_view``
70
- columns (``ArrowNotImplementedError``).
75
+ ``DeltaTable.to_pyarrow_table`` treats ``None`` as "no filter", so callers
76
+ must pass ``None`` rather than an empty list when there is nothing to filter
77
+ on.
71
78
  """
72
- col, op, val = filter_
73
- if op == "=":
74
- return pl.col(col) == val
75
- if op == "in":
76
- return pl.col(col).is_in(val)
77
- if op == ">=":
78
- return pl.col(col) >= val
79
- if op == "<=":
80
- return pl.col(col) <= val
81
- if op == ">":
82
- return pl.col(col) > val
83
- if op == "<":
84
- return pl.col(col) < val
85
- raise ValueError(f"Unsupported operation: {op}")
79
+ return filters if len(filters) > 0 else None
86
80
 
87
81
 
88
82
  class Table:
@@ -138,19 +132,14 @@ class Table:
138
132
  ) -> pl.DataFrame:
139
133
  """Read from Delta table.
140
134
 
141
- All filters are pushed down to the native Delta reader. Partition
142
- filters (including filters derived from generated partition columns)
143
- result in partition pruning, while non-partition filters are pushed
144
- down to the Parquet reader as predicates.
135
+ Filters are split into two groups so each is handled by the engine
136
+ best suited for it:
145
137
 
146
- We use the native Polars Delta reader (`pl.scan_delta`) rather than
147
- `DeltaTable.to_pyarrow_table(filters=...)` on purpose. The PyArrow
148
- filter path evaluates predicates with PyArrow compute kernels, which
149
- do not implement comparisons for the `string_view` type returned by
150
- delta-rs. That mismatch raises errors such as
151
- `ArrowNotImplementedError: Function 'greater_equal' has no kernel
152
- matching input types (string_view, string_view)`. The native reader
153
- evaluates predicates in its own engine and avoids this entirely.
138
+ * Partition filters (including filters derived from generated partition
139
+ columns) go to ``partitions=`` so delta-rs prunes whole partitions
140
+ before any data is read.
141
+ * Non-partition filters go to ``filters=`` and are pushed down to the
142
+ Parquet reader.
154
143
 
155
144
  Args:
156
145
  hyper_slice (HyperSlice): Hyper slice used to filter data.
@@ -159,29 +148,26 @@ class Table:
159
148
  if hyper_slice is None:
160
149
  hyper_slice = []
161
150
 
162
- # Generated filters add predicates on generated partition columns
163
- # (e.g. `date_utc` derived from `time_utc`) so the native reader can
164
- # prune partitions even when the caller only filters the base column.
165
- pushdown_slice = self.schema().add_generated_filters(hyper_slice)
166
-
167
- # `credential_provider=None` makes Polars pass `storage_options` straight
168
- # to the native object store, exactly like `dl.DeltaTable(...)` does. We
169
- # avoid the default `credential_provider="auto"`, which would build a
170
- # separate Polars-managed Azure credential and could diverge from the
171
- # managed-identity / Azure CLI auth used everywhere else in this class.
172
- lazy_frame = pl.scan_delta(
173
- self.url,
174
- storage_options=self.storage_options,
175
- credential_provider=None,
176
- )
151
+ # add generated filters to hyperslice
152
+ hyper_slice = self.schema().add_generated_filters(hyper_slice)
153
+
154
+ delta_table = self.delta_table
155
+ partition_cols = delta_table.metadata().partition_columns
177
156
 
178
- for filter_ in pushdown_slice:
179
- lazy_frame = lazy_frame.filter(_filter_to_polars_expr(filter_))
157
+ if len(hyper_slice) == 0:
158
+ file_filters = None
159
+ partition_filters = None
160
+ else:
161
+ file_filters = hyper_slice
162
+ partition_filters = [f for f in hyper_slice if f[0] in partition_cols]
180
163
 
181
- if columns is not None:
182
- lazy_frame = lazy_frame.select(columns)
164
+ pyarrow_table_existing_data = delta_table.to_pyarrow_table(
165
+ columns=columns,
166
+ partitions=partition_filters,
167
+ filters=file_filters,
168
+ )
183
169
 
184
- return lazy_frame.collect()
170
+ return pl.from_arrow(pyarrow_table_existing_data)
185
171
 
186
172
  def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
187
173
  """Convert Polars dataframe to pyarrow table with casted schema.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datazone-sdk"
3
- version = "6.0.1.dev9"
3
+ version = "6.0.1.dev11"
4
4
  description = "Database and Delta storage client library for working with Delta Lake tables"
5
5
  authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
6
6
  requires-python = ">=3.10"