datazone-sdk 6.0.1.dev4__tar.gz → 6.0.1.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/PKG-INFO +1 -1
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/__init__.py +1 -8
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/__init__.py +1 -1
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/table.py +48 -26
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/pyproject.toml +1 -1
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/README.md +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/backtesting.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/caching/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/caching/parquet.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/db/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/db/base.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/db/cached.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/db/snapshot.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/db/standard.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/data_types.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/generated_columns.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/schema.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/slicing.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/store.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/testing/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/testing/database_client.py +0 -0
|
@@ -1,13 +1,6 @@
|
|
|
1
1
|
from datazone.backtesting import backtest
|
|
2
2
|
from datazone.caching import ParquetCache
|
|
3
3
|
from datazone.db import DatabaseClient, SnapshotDatabaseClient
|
|
4
|
-
from datazone.deltastorage import (
|
|
5
|
-
DeltaOverwriteFallbackWarning,
|
|
6
|
-
Field,
|
|
7
|
-
HyperSlice,
|
|
8
|
-
Schema,
|
|
9
|
-
Store,
|
|
10
|
-
Table,
|
|
11
|
-
)
|
|
4
|
+
from datazone.deltastorage import Field, HyperSlice, Schema, Store, Table
|
|
12
5
|
|
|
13
6
|
from . import testing
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
import datetime as dt
|
|
2
|
-
import warnings
|
|
3
2
|
from typing import Any, Optional
|
|
4
3
|
|
|
5
4
|
import deltalake as dl
|
|
6
5
|
import polars as pl
|
|
7
6
|
import pyarrow as pa
|
|
8
|
-
from deltalake.exceptions import CommitFailedError
|
|
9
7
|
|
|
10
8
|
from .schema import Schema
|
|
11
9
|
from .slicing import HyperSlice
|
|
@@ -48,12 +46,29 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
|
|
|
48
46
|
return " AND ".join(sql_parts)
|
|
49
47
|
|
|
50
48
|
|
|
51
|
-
def
|
|
52
|
-
return
|
|
49
|
+
def _is_safe_partition_filter(filter_: tuple) -> bool:
|
|
50
|
+
return filter_[1] in ["=", "in"]
|
|
53
51
|
|
|
54
52
|
|
|
55
|
-
|
|
56
|
-
|
|
53
|
+
def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
|
|
54
|
+
return filters if len(filters) > 0 else None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
|
|
58
|
+
col, op, val = filter_
|
|
59
|
+
if op == "=":
|
|
60
|
+
return pl.col(col) == val
|
|
61
|
+
if op == "in":
|
|
62
|
+
return pl.col(col).is_in(val)
|
|
63
|
+
if op == ">=":
|
|
64
|
+
return pl.col(col) >= val
|
|
65
|
+
if op == "<=":
|
|
66
|
+
return pl.col(col) <= val
|
|
67
|
+
if op == ">":
|
|
68
|
+
return pl.col(col) > val
|
|
69
|
+
if op == "<":
|
|
70
|
+
return pl.col(col) < val
|
|
71
|
+
raise ValueError(f"Unsupported operation: {op}")
|
|
57
72
|
|
|
58
73
|
|
|
59
74
|
class Table:
|
|
@@ -115,8 +130,10 @@ class Table:
|
|
|
115
130
|
if hyper_slice is None:
|
|
116
131
|
hyper_slice = []
|
|
117
132
|
|
|
118
|
-
#
|
|
119
|
-
|
|
133
|
+
# Generated filters are an optimization for partition pruning. Keep them out
|
|
134
|
+
# of row filters because partition values may be represented as strings by
|
|
135
|
+
# PyArrow even when the Delta schema has a richer logical type.
|
|
136
|
+
partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
|
|
120
137
|
|
|
121
138
|
delta_table = self.delta_table
|
|
122
139
|
partition_cols = delta_table.metadata().partition_columns
|
|
@@ -124,9 +141,23 @@ class Table:
|
|
|
124
141
|
if len(hyper_slice) == 0:
|
|
125
142
|
file_filters = None
|
|
126
143
|
partition_filters = None
|
|
144
|
+
post_filters = []
|
|
127
145
|
else:
|
|
128
|
-
file_filters =
|
|
129
|
-
|
|
146
|
+
file_filters = _empty_to_none(
|
|
147
|
+
[f for f in hyper_slice if f[0] not in partition_cols]
|
|
148
|
+
)
|
|
149
|
+
partition_filters = _empty_to_none(
|
|
150
|
+
[
|
|
151
|
+
f
|
|
152
|
+
for f in partition_hyper_slice
|
|
153
|
+
if f[0] in partition_cols and _is_safe_partition_filter(f)
|
|
154
|
+
]
|
|
155
|
+
)
|
|
156
|
+
post_filters = [
|
|
157
|
+
f
|
|
158
|
+
for f in hyper_slice
|
|
159
|
+
if f[0] in partition_cols and not _is_safe_partition_filter(f)
|
|
160
|
+
]
|
|
130
161
|
|
|
131
162
|
pyarrow_table_existing_data = delta_table.to_pyarrow_table(
|
|
132
163
|
columns=columns,
|
|
@@ -134,7 +165,10 @@ class Table:
|
|
|
134
165
|
filters=file_filters,
|
|
135
166
|
)
|
|
136
167
|
|
|
137
|
-
|
|
168
|
+
df = pl.from_arrow(pyarrow_table_existing_data)
|
|
169
|
+
for filter_ in post_filters:
|
|
170
|
+
df = df.filter(_filter_to_polars_expr(filter_))
|
|
171
|
+
return df
|
|
138
172
|
|
|
139
173
|
def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
|
|
140
174
|
"""Convert Polars dataframe to pyarrow table with casted schema.
|
|
@@ -188,26 +222,14 @@ class Table:
|
|
|
188
222
|
schema = self.schema()
|
|
189
223
|
data = self._to_writable_pyarrow_table(df=df, schema=schema)
|
|
190
224
|
|
|
225
|
+
hyper_slice = schema.add_generated_filters(hyper_slice)
|
|
226
|
+
|
|
191
227
|
if len(hyper_slice) == 0:
|
|
192
228
|
predicate = None
|
|
193
229
|
else:
|
|
194
230
|
predicate = _dnf_to_sql(hyper_slice)
|
|
195
231
|
|
|
196
|
-
|
|
197
|
-
self._write_deltalake(data=data, mode="overwrite", predicate=predicate)
|
|
198
|
-
except CommitFailedError as error:
|
|
199
|
-
if predicate is None or not _is_arrow_cast_commit_failure(error):
|
|
200
|
-
raise
|
|
201
|
-
|
|
202
|
-
warnings.warn(
|
|
203
|
-
"Delta predicate overwrite failed with the DataFusion arrow_cast bug. "
|
|
204
|
-
"Falling back to delete followed by append; this is not atomic as a "
|
|
205
|
-
"single overwrite commit.",
|
|
206
|
-
DeltaOverwriteFallbackWarning,
|
|
207
|
-
stacklevel=2,
|
|
208
|
-
)
|
|
209
|
-
self.delta_table.delete(predicate)
|
|
210
|
-
self._write_deltalake(data=data, mode="append", predicate=None)
|
|
232
|
+
self._write_deltalake(data=data, mode="overwrite", predicate=predicate)
|
|
211
233
|
|
|
212
234
|
def append(self, df: pl.DataFrame) -> None:
|
|
213
235
|
"""Append rows to Delta Lake. This will write data to the Delta Lake.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "datazone-sdk"
|
|
3
|
-
version = "6.0.1.dev4"
|
|
3
|
+
version = "6.0.1.dev6"
|
|
4
4
|
description = "Database and Delta storage client library for working with Delta Lake tables"
|
|
5
5
|
authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
|
|
6
6
|
requires-python = ">=3.10"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/generated_columns.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|