datazone-sdk 6.0.1.dev3__tar.gz → 6.0.1.dev5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/PKG-INFO +1 -1
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/__init__.py +8 -1
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/__init__.py +1 -1
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/table.py +63 -5
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/pyproject.toml +1 -1
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/README.md +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/backtesting.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/caching/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/caching/parquet.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/db/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/db/base.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/db/cached.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/db/snapshot.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/db/standard.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/data_types.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/generated_columns.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/schema.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/slicing.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/store.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/testing/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/testing/database_client.py +0 -0
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
from datazone.backtesting import backtest
|
|
2
2
|
from datazone.caching import ParquetCache
|
|
3
3
|
from datazone.db import DatabaseClient, SnapshotDatabaseClient
|
|
4
|
-
from datazone.deltastorage import
|
|
4
|
+
from datazone.deltastorage import (
|
|
5
|
+
DeltaOverwriteFallbackWarning,
|
|
6
|
+
Field,
|
|
7
|
+
HyperSlice,
|
|
8
|
+
Schema,
|
|
9
|
+
Store,
|
|
10
|
+
Table,
|
|
11
|
+
)
|
|
5
12
|
|
|
6
13
|
from . import testing
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import datetime as dt
|
|
2
|
+
import warnings
|
|
2
3
|
from typing import Any, Optional
|
|
3
4
|
|
|
4
5
|
import deltalake as dl
|
|
@@ -51,6 +52,35 @@ def _is_arrow_cast_commit_failure(error: CommitFailedError) -> bool:
|
|
|
51
52
|
return "arrow_cast should have been simplified to cast" in str(error)
|
|
52
53
|
|
|
53
54
|
|
|
55
|
+
def _is_safe_partition_filter(filter_: tuple) -> bool:
|
|
56
|
+
return filter_[1] in ["=", "in"]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
|
|
60
|
+
return filters if len(filters) > 0 else None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
|
|
64
|
+
col, op, val = filter_
|
|
65
|
+
if op == "=":
|
|
66
|
+
return pl.col(col) == val
|
|
67
|
+
if op == "in":
|
|
68
|
+
return pl.col(col).is_in(val)
|
|
69
|
+
if op == ">=":
|
|
70
|
+
return pl.col(col) >= val
|
|
71
|
+
if op == "<=":
|
|
72
|
+
return pl.col(col) <= val
|
|
73
|
+
if op == ">":
|
|
74
|
+
return pl.col(col) > val
|
|
75
|
+
if op == "<":
|
|
76
|
+
return pl.col(col) < val
|
|
77
|
+
raise ValueError(f"Unsupported operation: {op}")
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class DeltaOverwriteFallbackWarning(RuntimeWarning):
    """Warning category for a non-atomic fallback during a predicate overwrite.

    Emitted when a Delta predicate overwrite fails with the DataFusion
    ``arrow_cast`` commit failure and the write falls back to a delete
    followed by an append, which is not atomic as a single overwrite commit.
    """
|
|
82
|
+
|
|
83
|
+
|
|
54
84
|
class Table:
|
|
55
85
|
def __init__(
|
|
56
86
|
self,
|
|
@@ -110,8 +140,10 @@ class Table:
|
|
|
110
140
|
if hyper_slice is None:
|
|
111
141
|
hyper_slice = []
|
|
112
142
|
|
|
113
|
-
#
|
|
114
|
-
|
|
143
|
+
# Generated filters are an optimization for partition pruning. Keep them out
|
|
144
|
+
# of row filters because partition values may be represented as strings by
|
|
145
|
+
# PyArrow even when the Delta schema has a richer logical type.
|
|
146
|
+
partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
|
|
115
147
|
|
|
116
148
|
delta_table = self.delta_table
|
|
117
149
|
partition_cols = delta_table.metadata().partition_columns
|
|
@@ -119,9 +151,23 @@ class Table:
|
|
|
119
151
|
if len(hyper_slice) == 0:
|
|
120
152
|
file_filters = None
|
|
121
153
|
partition_filters = None
|
|
154
|
+
post_filters = []
|
|
122
155
|
else:
|
|
123
|
-
file_filters =
|
|
124
|
-
|
|
156
|
+
file_filters = _empty_to_none(
|
|
157
|
+
[f for f in hyper_slice if f[0] not in partition_cols]
|
|
158
|
+
)
|
|
159
|
+
partition_filters = _empty_to_none(
|
|
160
|
+
[
|
|
161
|
+
f
|
|
162
|
+
for f in partition_hyper_slice
|
|
163
|
+
if f[0] in partition_cols and _is_safe_partition_filter(f)
|
|
164
|
+
]
|
|
165
|
+
)
|
|
166
|
+
post_filters = [
|
|
167
|
+
f
|
|
168
|
+
for f in hyper_slice
|
|
169
|
+
if f[0] in partition_cols and not _is_safe_partition_filter(f)
|
|
170
|
+
]
|
|
125
171
|
|
|
126
172
|
pyarrow_table_existing_data = delta_table.to_pyarrow_table(
|
|
127
173
|
columns=columns,
|
|
@@ -129,7 +175,10 @@ class Table:
|
|
|
129
175
|
filters=file_filters,
|
|
130
176
|
)
|
|
131
177
|
|
|
132
|
-
|
|
178
|
+
df = pl.from_arrow(pyarrow_table_existing_data)
|
|
179
|
+
for filter_ in post_filters:
|
|
180
|
+
df = df.filter(_filter_to_polars_expr(filter_))
|
|
181
|
+
return df
|
|
133
182
|
|
|
134
183
|
def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
|
|
135
184
|
"""Convert Polars dataframe to pyarrow table with casted schema.
|
|
@@ -183,6 +232,8 @@ class Table:
|
|
|
183
232
|
schema = self.schema()
|
|
184
233
|
data = self._to_writable_pyarrow_table(df=df, schema=schema)
|
|
185
234
|
|
|
235
|
+
hyper_slice = schema.add_generated_filters(hyper_slice)
|
|
236
|
+
|
|
186
237
|
if len(hyper_slice) == 0:
|
|
187
238
|
predicate = None
|
|
188
239
|
else:
|
|
@@ -194,6 +245,13 @@ class Table:
|
|
|
194
245
|
if predicate is None or not _is_arrow_cast_commit_failure(error):
|
|
195
246
|
raise
|
|
196
247
|
|
|
248
|
+
warnings.warn(
|
|
249
|
+
"Delta predicate overwrite failed with the DataFusion arrow_cast bug. "
|
|
250
|
+
"Falling back to delete followed by append; this is not atomic as a "
|
|
251
|
+
"single overwrite commit.",
|
|
252
|
+
DeltaOverwriteFallbackWarning,
|
|
253
|
+
stacklevel=2,
|
|
254
|
+
)
|
|
197
255
|
self.delta_table.delete(predicate)
|
|
198
256
|
self._write_deltalake(data=data, mode="append", predicate=None)
|
|
199
257
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "datazone-sdk"
|
|
3
|
-
version = "6.0.1.dev3"
|
|
3
|
+
version = "6.0.1.dev5"
|
|
4
4
|
description = "Database and Delta storage client library for working with Delta Lake tables"
|
|
5
5
|
authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
|
|
6
6
|
requires-python = ">=3.10"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/generated_columns.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|