datazone-sdk 6.0.1.dev4__tar.gz → 6.0.1.dev5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/PKG-INFO +1 -1
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/table.py +51 -5
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/pyproject.toml +1 -1
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/README.md +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/backtesting.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/caching/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/caching/parquet.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/db/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/db/base.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/db/cached.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/db/snapshot.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/db/standard.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/data_types.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/generated_columns.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/schema.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/slicing.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/store.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/testing/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/testing/database_client.py +0 -0
|
@@ -52,6 +52,31 @@ def _is_arrow_cast_commit_failure(error: CommitFailedError) -> bool:
|
|
|
52
52
|
return "arrow_cast should have been simplified to cast" in str(error)
|
|
53
53
|
|
|
54
54
|
|
|
55
|
+
def _is_safe_partition_filter(filter_: tuple) -> bool:
|
|
56
|
+
return filter_[1] in ["=", "in"]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
|
|
60
|
+
return filters if len(filters) > 0 else None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
|
|
64
|
+
col, op, val = filter_
|
|
65
|
+
if op == "=":
|
|
66
|
+
return pl.col(col) == val
|
|
67
|
+
if op == "in":
|
|
68
|
+
return pl.col(col).is_in(val)
|
|
69
|
+
if op == ">=":
|
|
70
|
+
return pl.col(col) >= val
|
|
71
|
+
if op == "<=":
|
|
72
|
+
return pl.col(col) <= val
|
|
73
|
+
if op == ">":
|
|
74
|
+
return pl.col(col) > val
|
|
75
|
+
if op == "<":
|
|
76
|
+
return pl.col(col) < val
|
|
77
|
+
raise ValueError(f"Unsupported operation: {op}")
|
|
78
|
+
|
|
79
|
+
|
|
55
80
|
class DeltaOverwriteFallbackWarning(RuntimeWarning):
|
|
56
81
|
pass
|
|
57
82
|
|
|
@@ -115,8 +140,10 @@ class Table:
|
|
|
115
140
|
if hyper_slice is None:
|
|
116
141
|
hyper_slice = []
|
|
117
142
|
|
|
118
|
-
#
|
|
119
|
-
|
|
143
|
+
# Generated filters are an optimization for partition pruning. Keep them out
|
|
144
|
+
# of row filters because partition values may be represented as strings by
|
|
145
|
+
# PyArrow even when the Delta schema has a richer logical type.
|
|
146
|
+
partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
|
|
120
147
|
|
|
121
148
|
delta_table = self.delta_table
|
|
122
149
|
partition_cols = delta_table.metadata().partition_columns
|
|
@@ -124,9 +151,23 @@ class Table:
|
|
|
124
151
|
if len(hyper_slice) == 0:
|
|
125
152
|
file_filters = None
|
|
126
153
|
partition_filters = None
|
|
154
|
+
post_filters = []
|
|
127
155
|
else:
|
|
128
|
-
file_filters =
|
|
129
|
-
|
|
156
|
+
file_filters = _empty_to_none(
|
|
157
|
+
[f for f in hyper_slice if f[0] not in partition_cols]
|
|
158
|
+
)
|
|
159
|
+
partition_filters = _empty_to_none(
|
|
160
|
+
[
|
|
161
|
+
f
|
|
162
|
+
for f in partition_hyper_slice
|
|
163
|
+
if f[0] in partition_cols and _is_safe_partition_filter(f)
|
|
164
|
+
]
|
|
165
|
+
)
|
|
166
|
+
post_filters = [
|
|
167
|
+
f
|
|
168
|
+
for f in hyper_slice
|
|
169
|
+
if f[0] in partition_cols and not _is_safe_partition_filter(f)
|
|
170
|
+
]
|
|
130
171
|
|
|
131
172
|
pyarrow_table_existing_data = delta_table.to_pyarrow_table(
|
|
132
173
|
columns=columns,
|
|
@@ -134,7 +175,10 @@ class Table:
|
|
|
134
175
|
filters=file_filters,
|
|
135
176
|
)
|
|
136
177
|
|
|
137
|
-
|
|
178
|
+
df = pl.from_arrow(pyarrow_table_existing_data)
|
|
179
|
+
for filter_ in post_filters:
|
|
180
|
+
df = df.filter(_filter_to_polars_expr(filter_))
|
|
181
|
+
return df
|
|
138
182
|
|
|
139
183
|
def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
|
|
140
184
|
"""Convert Polars dataframe to pyarrow table with casted schema.
|
|
@@ -188,6 +232,8 @@ class Table:
|
|
|
188
232
|
schema = self.schema()
|
|
189
233
|
data = self._to_writable_pyarrow_table(df=df, schema=schema)
|
|
190
234
|
|
|
235
|
+
hyper_slice = schema.add_generated_filters(hyper_slice)
|
|
236
|
+
|
|
191
237
|
if len(hyper_slice) == 0:
|
|
192
238
|
predicate = None
|
|
193
239
|
else:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "datazone-sdk"
|
|
3
|
-
version = "6.0.1.dev4"
|
|
3
|
+
version = "6.0.1.dev5"
|
|
4
4
|
description = "Database and Delta storage client library for working with Delta Lake tables"
|
|
5
5
|
authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
|
|
6
6
|
requires-python = ">=3.10"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/generated_columns.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|