datazone-sdk 6.0.1.dev9__tar.gz → 6.0.1.dev10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/PKG-INFO +1 -1
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/table.py +53 -36
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/pyproject.toml +1 -1
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/README.md +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/backtesting.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/caching/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/caching/parquet.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/db/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/db/base.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/db/cached.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/db/snapshot.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/db/standard.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/data_types.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/generated_columns.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/schema.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/slicing.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/store.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/testing/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/testing/database_client.py +0 -0
|
@@ -61,13 +61,21 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
|
|
|
61
61
|
return " AND ".join(sql_parts)
|
|
62
62
|
|
|
63
63
|
|
|
64
|
+
def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
|
|
65
|
+
"""Return ``None`` for an empty filter list.
|
|
66
|
+
|
|
67
|
+
``DeltaTable.to_pyarrow_table`` treats ``None`` as "no filter", so callers
|
|
68
|
+
must pass ``None`` rather than an empty list when there is nothing to filter
|
|
69
|
+
on.
|
|
70
|
+
"""
|
|
71
|
+
return filters if len(filters) > 0 else None
|
|
72
|
+
|
|
73
|
+
|
|
64
74
|
def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
|
|
65
75
|
"""Convert a single tuple filter to a Polars expression.
|
|
66
76
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
PyArrow filters, whose compute kernels fail on delta-rs ``string_view``
|
|
70
|
-
columns (``ArrowNotImplementedError``).
|
|
77
|
+
Used to apply residual partition predicates as a Polars post-filter in
|
|
78
|
+
``Table.read`` after delta-rs partition pruning has narrowed the files read.
|
|
71
79
|
"""
|
|
72
80
|
col, op, val = filter_
|
|
73
81
|
if op == "=":
|
|
@@ -138,19 +146,16 @@ class Table:
|
|
|
138
146
|
) -> pl.DataFrame:
|
|
139
147
|
"""Read from Delta table.
|
|
140
148
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
result in partition pruning, while non-partition filters are pushed
|
|
144
|
-
down to the Parquet reader as predicates.
|
|
149
|
+
Filters are split into three groups so each is handled by the engine
|
|
150
|
+
best suited for it:
|
|
145
151
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
evaluates predicates in its own engine and avoids this entirely.
|
|
152
|
+
* Partition filters (including filters derived from generated partition
|
|
153
|
+
columns) go to ``partitions=`` so delta-rs prunes whole partitions
|
|
154
|
+
before any data is read.
|
|
155
|
+
* Non-partition filters go to ``filters=`` and are pushed down to the
|
|
156
|
+
Parquet reader.
|
|
157
|
+
* Residual partition predicates that cannot be expressed as a partition
|
|
158
|
+
filter are applied afterwards as a Polars post-filter.
|
|
154
159
|
|
|
155
160
|
Args:
|
|
156
161
|
hyper_slice (HyperSlice): Hyper slice used to filter data.
|
|
@@ -159,29 +164,41 @@ class Table:
|
|
|
159
164
|
if hyper_slice is None:
|
|
160
165
|
hyper_slice = []
|
|
161
166
|
|
|
162
|
-
# Generated filters
|
|
163
|
-
#
|
|
164
|
-
#
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
# `credential_provider=None` makes Polars pass `storage_options` straight
|
|
168
|
-
# to the native object store, exactly like `dl.DeltaTable(...)` does. We
|
|
169
|
-
# avoid the default `credential_provider="auto"`, which would build a
|
|
170
|
-
# separate Polars-managed Azure credential and could diverge from the
|
|
171
|
-
# managed-identity / Azure CLI auth used everywhere else in this class.
|
|
172
|
-
lazy_frame = pl.scan_delta(
|
|
173
|
-
self.url,
|
|
174
|
-
storage_options=self.storage_options,
|
|
175
|
-
credential_provider=None,
|
|
176
|
-
)
|
|
167
|
+
# Generated filters are an optimization for partition pruning. Keep all
|
|
168
|
+
# partition filters out of row filters because partition values may be
|
|
169
|
+
# represented differently by PyArrow than by Delta's partition pruning.
|
|
170
|
+
partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
|
|
177
171
|
|
|
178
|
-
|
|
179
|
-
|
|
172
|
+
delta_table = self.delta_table
|
|
173
|
+
partition_cols = delta_table.metadata().partition_columns
|
|
180
174
|
|
|
181
|
-
if
|
|
182
|
-
|
|
175
|
+
if len(hyper_slice) == 0:
|
|
176
|
+
file_filters = None
|
|
177
|
+
partition_filters = None
|
|
178
|
+
post_filters = []
|
|
179
|
+
else:
|
|
180
|
+
file_filters = _empty_to_none(
|
|
181
|
+
[f for f in hyper_slice if f[0] not in partition_cols]
|
|
182
|
+
)
|
|
183
|
+
partition_filters = _empty_to_none(
|
|
184
|
+
[f for f in partition_hyper_slice if f[0] in partition_cols]
|
|
185
|
+
)
|
|
186
|
+
post_filters = [
|
|
187
|
+
f
|
|
188
|
+
for f in hyper_slice
|
|
189
|
+
if f[0] in partition_cols and f not in partition_hyper_slice
|
|
190
|
+
]
|
|
191
|
+
|
|
192
|
+
pyarrow_table_existing_data = delta_table.to_pyarrow_table(
|
|
193
|
+
columns=columns,
|
|
194
|
+
partitions=partition_filters,
|
|
195
|
+
filters=file_filters,
|
|
196
|
+
)
|
|
183
197
|
|
|
184
|
-
|
|
198
|
+
df = pl.from_arrow(pyarrow_table_existing_data)
|
|
199
|
+
for filter_ in post_filters:
|
|
200
|
+
df = df.filter(_filter_to_polars_expr(filter_))
|
|
201
|
+
return df
|
|
185
202
|
|
|
186
203
|
def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
|
|
187
204
|
"""Convert Polars dataframe to pyarrow table with casted schema.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "datazone-sdk"
|
|
3
|
-
version = "6.0.1.dev9"
|
|
3
|
+
version = "6.0.1.dev10"
|
|
4
4
|
description = "Database and Delta storage client library for working with Delta Lake tables"
|
|
5
5
|
authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
|
|
6
6
|
requires-python = ">=3.10"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/generated_columns.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|