datazone-sdk 6.0.1.dev9__tar.gz → 6.0.1.dev10__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (21)
  1. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/PKG-INFO +1 -1
  2. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/table.py +53 -36
  3. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/pyproject.toml +1 -1
  4. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/README.md +0 -0
  5. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/__init__.py +0 -0
  6. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/backtesting.py +0 -0
  7. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/caching/__init__.py +0 -0
  8. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/caching/parquet.py +0 -0
  9. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/db/__init__.py +0 -0
  10. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/db/base.py +0 -0
  11. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/db/cached.py +0 -0
  12. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/db/snapshot.py +0 -0
  13. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/db/standard.py +0 -0
  14. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/__init__.py +0 -0
  15. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/data_types.py +0 -0
  16. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/generated_columns.py +0 -0
  17. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/schema.py +0 -0
  18. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/slicing.py +0 -0
  19. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/store.py +0 -0
  20. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/testing/__init__.py +0 -0
  21. {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/testing/database_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datazone-sdk
3
- Version: 6.0.1.dev9
3
+ Version: 6.0.1.dev10
4
4
  Summary: Database and Delta storage client library for working with Delta Lake tables
5
5
  Author: Team Enigma
6
6
  Author-email: enigma@energinet.dk
@@ -61,13 +61,21 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
61
61
  return " AND ".join(sql_parts)
62
62
 
63
63
 
64
+ def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
65
+ """Return ``None`` for an empty filter list.
66
+
67
+ ``DeltaTable.to_pyarrow_table`` treats ``None`` as "no filter", so callers
68
+ must pass ``None`` rather than an empty list when there is nothing to filter
69
+ on.
70
+ """
71
+ return filters if len(filters) > 0 else None
72
+
73
+
64
74
  def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
65
75
  """Convert a single tuple filter to a Polars expression.
66
76
 
67
- Needed so the SDK's ``(column, op, value)`` filters can be pushed down as
68
- predicates to the native Polars Delta reader in ``Table.read``, instead of
69
- PyArrow filters, whose compute kernels fail on delta-rs ``string_view``
70
- columns (``ArrowNotImplementedError``).
77
+ Used to apply residual partition predicates as a Polars post-filter in
78
+ ``Table.read`` after delta-rs partition pruning has narrowed the files read.
71
79
  """
72
80
  col, op, val = filter_
73
81
  if op == "=":
@@ -138,19 +146,16 @@ class Table:
138
146
  ) -> pl.DataFrame:
139
147
  """Read from Delta table.
140
148
 
141
- All filters are pushed down to the native Delta reader. Partition
142
- filters (including filters derived from generated partition columns)
143
- result in partition pruning, while non-partition filters are pushed
144
- down to the Parquet reader as predicates.
149
+ Filters are split into three groups so each is handled by the engine
150
+ best suited for it:
145
151
 
146
- We use the native Polars Delta reader (`pl.scan_delta`) rather than
147
- `DeltaTable.to_pyarrow_table(filters=...)` on purpose. The PyArrow
148
- filter path evaluates predicates with PyArrow compute kernels, which
149
- do not implement comparisons for the `string_view` type returned by
150
- delta-rs. That mismatch raises errors such as
151
- `ArrowNotImplementedError: Function 'greater_equal' has no kernel
152
- matching input types (string_view, string_view)`. The native reader
153
- evaluates predicates in its own engine and avoids this entirely.
152
+ * Partition filters (including filters derived from generated partition
153
+ columns) go to ``partitions=`` so delta-rs prunes whole partitions
154
+ before any data is read.
155
+ * Non-partition filters go to ``filters=`` and are pushed down to the
156
+ Parquet reader.
157
+ * Residual partition predicates that cannot be expressed as a partition
158
+ filter are applied afterwards as a Polars post-filter.
154
159
 
155
160
  Args:
156
161
  hyper_slice (HyperSlice): Hyper slice used to filter data.
@@ -159,29 +164,41 @@ class Table:
159
164
  if hyper_slice is None:
160
165
  hyper_slice = []
161
166
 
162
- # Generated filters add predicates on generated partition columns
163
- # (e.g. `date_utc` derived from `time_utc`) so the native reader can
164
- # prune partitions even when the caller only filters the base column.
165
- pushdown_slice = self.schema().add_generated_filters(hyper_slice)
166
-
167
- # `credential_provider=None` makes Polars pass `storage_options` straight
168
- # to the native object store, exactly like `dl.DeltaTable(...)` does. We
169
- # avoid the default `credential_provider="auto"`, which would build a
170
- # separate Polars-managed Azure credential and could diverge from the
171
- # managed-identity / Azure CLI auth used everywhere else in this class.
172
- lazy_frame = pl.scan_delta(
173
- self.url,
174
- storage_options=self.storage_options,
175
- credential_provider=None,
176
- )
167
+ # Generated filters are an optimization for partition pruning. Keep all
168
+ # partition filters out of row filters because partition values may be
169
+ # represented differently by PyArrow than by Delta's partition pruning.
170
+ partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
177
171
 
178
- for filter_ in pushdown_slice:
179
- lazy_frame = lazy_frame.filter(_filter_to_polars_expr(filter_))
172
+ delta_table = self.delta_table
173
+ partition_cols = delta_table.metadata().partition_columns
180
174
 
181
- if columns is not None:
182
- lazy_frame = lazy_frame.select(columns)
175
+ if len(hyper_slice) == 0:
176
+ file_filters = None
177
+ partition_filters = None
178
+ post_filters = []
179
+ else:
180
+ file_filters = _empty_to_none(
181
+ [f for f in hyper_slice if f[0] not in partition_cols]
182
+ )
183
+ partition_filters = _empty_to_none(
184
+ [f for f in partition_hyper_slice if f[0] in partition_cols]
185
+ )
186
+ post_filters = [
187
+ f
188
+ for f in hyper_slice
189
+ if f[0] in partition_cols and f not in partition_hyper_slice
190
+ ]
191
+
192
+ pyarrow_table_existing_data = delta_table.to_pyarrow_table(
193
+ columns=columns,
194
+ partitions=partition_filters,
195
+ filters=file_filters,
196
+ )
183
197
 
184
- return lazy_frame.collect()
198
+ df = pl.from_arrow(pyarrow_table_existing_data)
199
+ for filter_ in post_filters:
200
+ df = df.filter(_filter_to_polars_expr(filter_))
201
+ return df
185
202
 
186
203
  def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
187
204
  """Convert Polars dataframe to pyarrow table with casted schema.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datazone-sdk"
3
- version = "6.0.1.dev9"
3
+ version = "6.0.1.dev10"
4
4
  description = "Database and Delta storage client library for working with Delta Lake tables"
5
5
  authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
6
6
  requires-python = ">=3.10"