datazone-sdk 6.0.1.dev6__tar.gz → 6.0.1.dev8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/PKG-INFO +1 -1
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/table.py +40 -46
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/pyproject.toml +1 -1
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/README.md +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/backtesting.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/caching/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/caching/parquet.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/db/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/db/base.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/db/cached.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/db/snapshot.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/db/standard.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/data_types.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/generated_columns.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/schema.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/slicing.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/store.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/testing/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/testing/database_client.py +0 -0
|
@@ -46,14 +46,6 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
|
|
|
46
46
|
return " AND ".join(sql_parts)
|
|
47
47
|
|
|
48
48
|
|
|
49
|
-
def _is_safe_partition_filter(filter_: tuple) -> bool:
|
|
50
|
-
return filter_[1] in ["=", "in"]
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
|
|
54
|
-
return filters if len(filters) > 0 else None
|
|
55
|
-
|
|
56
|
-
|
|
57
49
|
def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
|
|
58
50
|
col, op, val = filter_
|
|
59
51
|
if op == "=":
|
|
@@ -122,53 +114,55 @@ class Table:
|
|
|
122
114
|
def read(
|
|
123
115
|
self, hyper_slice: Optional[HyperSlice] = None, columns=None
|
|
124
116
|
) -> pl.DataFrame:
|
|
125
|
-
"""Read from Delta table
|
|
117
|
+
"""Read from Delta table.
|
|
118
|
+
|
|
119
|
+
All filters are pushed down to the native Delta reader. Partition
|
|
120
|
+
filters (including filters derived from generated partition columns)
|
|
121
|
+
result in partition pruning, while non-partition filters are pushed
|
|
122
|
+
down to the Parquet reader as predicates.
|
|
123
|
+
|
|
124
|
+
We use the native Polars Delta reader (`pl.scan_delta`) rather than
|
|
125
|
+
`DeltaTable.to_pyarrow_table(filters=...)` on purpose. The PyArrow
|
|
126
|
+
filter path evaluates predicates with PyArrow compute kernels, which
|
|
127
|
+
do not implement comparisons for the `string_view` type returned by
|
|
128
|
+
delta-rs. That mismatch raises errors such as
|
|
129
|
+
`ArrowNotImplementedError: Function 'greater_equal' has no kernel
|
|
130
|
+
matching input types (string_view, string_view)`. The native reader
|
|
131
|
+
evaluates predicates in its own engine and avoids this entirely.
|
|
132
|
+
|
|
133
|
+
Any further (post-)filtering that cannot be expressed as a pushdown
|
|
134
|
+
predicate is the responsibility of the caller.
|
|
126
135
|
|
|
127
136
|
Args:
|
|
128
|
-
hyper_slice (HyperSlice): Hyper
|
|
137
|
+
hyper_slice (HyperSlice): Hyper slice used to filter data.
|
|
138
|
+
columns: Optional list of columns to project.
|
|
129
139
|
"""
|
|
130
140
|
if hyper_slice is None:
|
|
131
141
|
hyper_slice = []
|
|
132
142
|
|
|
133
|
-
# Generated filters
|
|
134
|
-
#
|
|
135
|
-
#
|
|
136
|
-
|
|
143
|
+
# Generated filters add predicates on generated partition columns
|
|
144
|
+
# (e.g. `date_utc` derived from `time_utc`) so the native reader can
|
|
145
|
+
# prune partitions even when the caller only filters the base column.
|
|
146
|
+
pushdown_slice = self.schema().add_generated_filters(hyper_slice)
|
|
147
|
+
|
|
148
|
+
# `credential_provider=None` makes Polars pass `storage_options` straight
|
|
149
|
+
# to the native object store, exactly like `dl.DeltaTable(...)` does. We
|
|
150
|
+
# avoid the default `credential_provider="auto"`, which would build a
|
|
151
|
+
# separate Polars-managed Azure credential and could diverge from the
|
|
152
|
+
# managed-identity / Azure CLI auth used everywhere else in this class.
|
|
153
|
+
lazy_frame = pl.scan_delta(
|
|
154
|
+
self.url,
|
|
155
|
+
storage_options=self.storage_options,
|
|
156
|
+
credential_provider=None,
|
|
157
|
+
)
|
|
137
158
|
|
|
138
|
-
|
|
139
|
-
|
|
159
|
+
for filter_ in pushdown_slice:
|
|
160
|
+
lazy_frame = lazy_frame.filter(_filter_to_polars_expr(filter_))
|
|
140
161
|
|
|
141
|
-
if
|
|
142
|
-
|
|
143
|
-
partition_filters = None
|
|
144
|
-
post_filters = []
|
|
145
|
-
else:
|
|
146
|
-
file_filters = _empty_to_none(
|
|
147
|
-
[f for f in hyper_slice if f[0] not in partition_cols]
|
|
148
|
-
)
|
|
149
|
-
partition_filters = _empty_to_none(
|
|
150
|
-
[
|
|
151
|
-
f
|
|
152
|
-
for f in partition_hyper_slice
|
|
153
|
-
if f[0] in partition_cols and _is_safe_partition_filter(f)
|
|
154
|
-
]
|
|
155
|
-
)
|
|
156
|
-
post_filters = [
|
|
157
|
-
f
|
|
158
|
-
for f in hyper_slice
|
|
159
|
-
if f[0] in partition_cols and not _is_safe_partition_filter(f)
|
|
160
|
-
]
|
|
161
|
-
|
|
162
|
-
pyarrow_table_existing_data = delta_table.to_pyarrow_table(
|
|
163
|
-
columns=columns,
|
|
164
|
-
partitions=partition_filters,
|
|
165
|
-
filters=file_filters,
|
|
166
|
-
)
|
|
162
|
+
if columns is not None:
|
|
163
|
+
lazy_frame = lazy_frame.select(columns)
|
|
167
164
|
|
|
168
|
-
|
|
169
|
-
for filter_ in post_filters:
|
|
170
|
-
df = df.filter(_filter_to_polars_expr(filter_))
|
|
171
|
-
return df
|
|
165
|
+
return lazy_frame.collect()
|
|
172
166
|
|
|
173
167
|
def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
|
|
174
168
|
"""Convert Polars dataframe to pyarrow table with casted schema.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "datazone-sdk"
|
|
3
|
-
version = "6.0.1.dev6"
|
|
3
|
+
version = "6.0.1.dev8"
|
|
4
4
|
description = "Database and Delta storage client library for working with Delta Lake tables"
|
|
5
5
|
authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
|
|
6
6
|
requires-python = ">=3.10"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/generated_columns.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|