datazone-sdk 6.0.1.dev10__tar.gz → 6.0.1.dev11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/PKG-INFO +1 -1
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/table.py +22 -53
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/pyproject.toml +1 -1
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/README.md +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/backtesting.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/caching/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/caching/parquet.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/db/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/db/base.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/db/cached.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/db/snapshot.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/db/standard.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/data_types.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/generated_columns.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/schema.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/slicing.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/store.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/testing/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/testing/database_client.py +0 -0
|
@@ -12,12 +12,12 @@ from .slicing import HyperSlice
|
|
|
12
12
|
def _sql_literal(value: Any) -> str:
|
|
13
13
|
"""Render a Python value as a *type-correct* SQL literal for Delta predicates.
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
15
|
+
datafusion type-checks predicates and does not always coerce a quoted
|
|
16
|
+
string literal to the column type (in particular not inside ``IN (...)``,
|
|
17
|
+
where a string compared to a timestamp column raises
|
|
18
|
+
``Timestamp(us, "UTC") <= Utf8View``). Rendering each value with its real
|
|
19
|
+
type avoids relying on that coercion: numbers are left unquoted, booleans
|
|
20
|
+
become ``TRUE``/``FALSE`` and ``None`` becomes ``NULL``; only real
|
|
21
21
|
strings/dates are quoted (and escaped).
|
|
22
22
|
"""
|
|
23
23
|
if isinstance(value, dt.datetime):
|
|
@@ -38,8 +38,16 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
|
|
|
38
38
|
"""Convert a hyper slice (DNF expression) to a Delta predicate SQL string.
|
|
39
39
|
|
|
40
40
|
Needed because delta-rs overwrite/delete operations accept a SQL predicate
|
|
41
|
-
string, not the tuple filter format the SDK uses everywhere else.
|
|
42
|
-
|
|
41
|
+
string, not the tuple filter format the SDK uses everywhere else.
|
|
42
|
+
|
|
43
|
+
``in`` is rendered as an OR-of-equalities rather than SQL ``IN (...)`` on
|
|
44
|
+
purpose: datafusion coerces a quoted string literal to the column type for
|
|
45
|
+
the binary ``=`` operator, but NOT inside ``IN (...)``. So a timestamp
|
|
46
|
+
filter written as ``time_utc IN ('2025-01-01T...')`` raises
|
|
47
|
+
``Invalid comparison operation: Timestamp(us, "UTC") <= Utf8View``, while
|
|
48
|
+
``(time_utc = '2025-01-01T...')`` is coerced and works. (For columns whose
|
|
49
|
+
type already matches the literal, e.g. ints, ``IN`` would be fine -- but the
|
|
50
|
+
OR form is correct for every column type.)
|
|
43
51
|
"""
|
|
44
52
|
if len(dnf) == 0:
|
|
45
53
|
return "1=1"
|
|
@@ -71,28 +79,6 @@ def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
|
|
|
71
79
|
return filters if len(filters) > 0 else None
|
|
72
80
|
|
|
73
81
|
|
|
74
|
-
def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
|
|
75
|
-
"""Convert a single tuple filter to a Polars expression.
|
|
76
|
-
|
|
77
|
-
Used to apply residual partition predicates as a Polars post-filter in
|
|
78
|
-
``Table.read`` after delta-rs partition pruning has narrowed the files read.
|
|
79
|
-
"""
|
|
80
|
-
col, op, val = filter_
|
|
81
|
-
if op == "=":
|
|
82
|
-
return pl.col(col) == val
|
|
83
|
-
if op == "in":
|
|
84
|
-
return pl.col(col).is_in(val)
|
|
85
|
-
if op == ">=":
|
|
86
|
-
return pl.col(col) >= val
|
|
87
|
-
if op == "<=":
|
|
88
|
-
return pl.col(col) <= val
|
|
89
|
-
if op == ">":
|
|
90
|
-
return pl.col(col) > val
|
|
91
|
-
if op == "<":
|
|
92
|
-
return pl.col(col) < val
|
|
93
|
-
raise ValueError(f"Unsupported operation: {op}")
|
|
94
|
-
|
|
95
|
-
|
|
96
82
|
class Table:
|
|
97
83
|
def __init__(
|
|
98
84
|
self,
|
|
@@ -146,7 +132,7 @@ class Table:
|
|
|
146
132
|
) -> pl.DataFrame:
|
|
147
133
|
"""Read from Delta table.
|
|
148
134
|
|
|
149
|
-
Filters are split into three groups so each is handled by the engine
|
|
135
|
+
Filters are split into two groups so each is handled by the engine
|
|
150
136
|
best suited for it:
|
|
151
137
|
|
|
152
138
|
* Partition filters (including filters derived from generated partition
|
|
@@ -154,8 +140,6 @@ class Table:
|
|
|
154
140
|
before any data is read.
|
|
155
141
|
* Non-partition filters go to ``filters=`` and are pushed down to the
|
|
156
142
|
Parquet reader.
|
|
157
|
-
* Residual partition predicates that cannot be expressed as a partition
|
|
158
|
-
filter are applied afterwards as a Polars post-filter.
|
|
159
143
|
|
|
160
144
|
Args:
|
|
161
145
|
hyper_slice (HyperSlice): Hyper slice used to filter data.
|
|
@@ -164,10 +148,8 @@ class Table:
|
|
|
164
148
|
if hyper_slice is None:
|
|
165
149
|
hyper_slice = []
|
|
166
150
|
|
|
167
|
-
#
|
|
168
|
-
|
|
169
|
-
# represented differently by PyArrow than by Delta's partition pruning.
|
|
170
|
-
partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
|
|
151
|
+
# add generated filters to hyperslice
|
|
152
|
+
hyper_slice = self.schema().add_generated_filters(hyper_slice)
|
|
171
153
|
|
|
172
154
|
delta_table = self.delta_table
|
|
173
155
|
partition_cols = delta_table.metadata().partition_columns
|
|
@@ -175,19 +157,9 @@ class Table:
|
|
|
175
157
|
if len(hyper_slice) == 0:
|
|
176
158
|
file_filters = None
|
|
177
159
|
partition_filters = None
|
|
178
|
-
post_filters = []
|
|
179
160
|
else:
|
|
180
|
-
file_filters =
|
|
181
|
-
|
|
182
|
-
)
|
|
183
|
-
partition_filters = _empty_to_none(
|
|
184
|
-
[f for f in partition_hyper_slice if f[0] in partition_cols]
|
|
185
|
-
)
|
|
186
|
-
post_filters = [
|
|
187
|
-
f
|
|
188
|
-
for f in hyper_slice
|
|
189
|
-
if f[0] in partition_cols and f not in partition_hyper_slice
|
|
190
|
-
]
|
|
161
|
+
file_filters = hyper_slice
|
|
162
|
+
partition_filters = [f for f in hyper_slice if f[0] in partition_cols]
|
|
191
163
|
|
|
192
164
|
pyarrow_table_existing_data = delta_table.to_pyarrow_table(
|
|
193
165
|
columns=columns,
|
|
@@ -195,10 +167,7 @@ class Table:
|
|
|
195
167
|
filters=file_filters,
|
|
196
168
|
)
|
|
197
169
|
|
|
198
|
-
|
|
199
|
-
for filter_ in post_filters:
|
|
200
|
-
df = df.filter(_filter_to_polars_expr(filter_))
|
|
201
|
-
return df
|
|
170
|
+
return pl.from_arrow(pyarrow_table_existing_data)
|
|
202
171
|
|
|
203
172
|
def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
|
|
204
173
|
"""Convert Polars dataframe to pyarrow table with casted schema.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "datazone-sdk"
|
|
3
|
-
version = "6.0.1.dev10"
|
|
3
|
+
version = "6.0.1.dev11"
|
|
4
4
|
description = "Database and Delta storage client library for working with Delta Lake tables"
|
|
5
5
|
authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
|
|
6
6
|
requires-python = ">=3.10"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/generated_columns.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|