datazone-sdk 6.0.1.dev9__tar.gz → 6.0.1.dev11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/PKG-INFO +1 -1
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/table.py +46 -60
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/pyproject.toml +1 -1
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/README.md +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/backtesting.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/caching/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/caching/parquet.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/db/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/db/base.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/db/cached.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/db/snapshot.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/db/standard.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/data_types.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/generated_columns.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/schema.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/slicing.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/store.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/testing/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/testing/database_client.py +0 -0
|
@@ -12,12 +12,12 @@ from .slicing import HyperSlice
|
|
|
12
12
|
def _sql_literal(value: Any) -> str:
|
|
13
13
|
"""Render a Python value as a *type-correct* SQL literal for Delta predicates.
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
15
|
+
datafusion type-checks predicates and does not always coerce a quoted
|
|
16
|
+
string literal to the column type (in particular not inside ``IN (...)``,
|
|
17
|
+
where a string compared to a timestamp column raises
|
|
18
|
+
``Timestamp(us, "UTC") <= Utf8View``). Rendering each value with its real
|
|
19
|
+
type avoids relying on that coercion: numbers are left unquoted, booleans
|
|
20
|
+
become ``TRUE``/``FALSE`` and ``None`` becomes ``NULL``; only real
|
|
21
21
|
strings/dates are quoted (and escaped).
|
|
22
22
|
"""
|
|
23
23
|
if isinstance(value, dt.datetime):
|
|
@@ -38,8 +38,16 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
|
|
|
38
38
|
"""Convert a hyper slice (DNF expression) to a Delta predicate SQL string.
|
|
39
39
|
|
|
40
40
|
Needed because delta-rs overwrite/delete operations accept a SQL predicate
|
|
41
|
-
string, not the tuple filter format the SDK uses everywhere else.
|
|
42
|
-
|
|
41
|
+
string, not the tuple filter format the SDK uses everywhere else.
|
|
42
|
+
|
|
43
|
+
``in`` is rendered as an OR-of-equalities rather than SQL ``IN (...)`` on
|
|
44
|
+
purpose: datafusion coerces a quoted string literal to the column type for
|
|
45
|
+
the binary ``=`` operator, but NOT inside ``IN (...)``. So a timestamp
|
|
46
|
+
filter written as ``time_utc IN ('2025-01-01T...')`` raises
|
|
47
|
+
``Invalid comparison operation: Timestamp(us, "UTC") <= Utf8View``, while
|
|
48
|
+
``(time_utc = '2025-01-01T...')`` is coerced and works. (For columns whose
|
|
49
|
+
type already matches the literal, e.g. ints, ``IN`` would be fine -- but the
|
|
50
|
+
OR form is correct for every column type.)
|
|
43
51
|
"""
|
|
44
52
|
if len(dnf) == 0:
|
|
45
53
|
return "1=1"
|
|
@@ -61,28 +69,14 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
|
|
|
61
69
|
return " AND ".join(sql_parts)
|
|
62
70
|
|
|
63
71
|
|
|
64
|
-
def
|
|
65
|
-
"""
|
|
72
|
+
def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
|
|
73
|
+
"""Return ``None`` for an empty filter list.
|
|
66
74
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
columns (``ArrowNotImplementedError``).
|
|
75
|
+
``DeltaTable.to_pyarrow_table`` treats ``None`` as "no filter", so callers
|
|
76
|
+
must pass ``None`` rather than an empty list when there is nothing to filter
|
|
77
|
+
on.
|
|
71
78
|
"""
|
|
72
|
-
|
|
73
|
-
if op == "=":
|
|
74
|
-
return pl.col(col) == val
|
|
75
|
-
if op == "in":
|
|
76
|
-
return pl.col(col).is_in(val)
|
|
77
|
-
if op == ">=":
|
|
78
|
-
return pl.col(col) >= val
|
|
79
|
-
if op == "<=":
|
|
80
|
-
return pl.col(col) <= val
|
|
81
|
-
if op == ">":
|
|
82
|
-
return pl.col(col) > val
|
|
83
|
-
if op == "<":
|
|
84
|
-
return pl.col(col) < val
|
|
85
|
-
raise ValueError(f"Unsupported operation: {op}")
|
|
79
|
+
return filters if len(filters) > 0 else None
|
|
86
80
|
|
|
87
81
|
|
|
88
82
|
class Table:
|
|
@@ -138,19 +132,14 @@ class Table:
|
|
|
138
132
|
) -> pl.DataFrame:
|
|
139
133
|
"""Read from Delta table.
|
|
140
134
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
result in partition pruning, while non-partition filters are pushed
|
|
144
|
-
down to the Parquet reader as predicates.
|
|
135
|
+
Filters are split into two groups so each is handled by the engine
|
|
136
|
+
best suited for it:
|
|
145
137
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
`ArrowNotImplementedError: Function 'greater_equal' has no kernel
|
|
152
|
-
matching input types (string_view, string_view)`. The native reader
|
|
153
|
-
evaluates predicates in its own engine and avoids this entirely.
|
|
138
|
+
* Partition filters (including filters derived from generated partition
|
|
139
|
+
columns) go to ``partitions=`` so delta-rs prunes whole partitions
|
|
140
|
+
before any data is read.
|
|
141
|
+
* Non-partition filters go to ``filters=`` and are pushed down to the
|
|
142
|
+
Parquet reader.
|
|
154
143
|
|
|
155
144
|
Args:
|
|
156
145
|
hyper_slice (HyperSlice): Hyper slice used to filter data.
|
|
@@ -159,29 +148,26 @@ class Table:
|
|
|
159
148
|
if hyper_slice is None:
|
|
160
149
|
hyper_slice = []
|
|
161
150
|
|
|
162
|
-
#
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
# `credential_provider=None` makes Polars pass `storage_options` straight
|
|
168
|
-
# to the native object store, exactly like `dl.DeltaTable(...)` does. We
|
|
169
|
-
# avoid the default `credential_provider="auto"`, which would build a
|
|
170
|
-
# separate Polars-managed Azure credential and could diverge from the
|
|
171
|
-
# managed-identity / Azure CLI auth used everywhere else in this class.
|
|
172
|
-
lazy_frame = pl.scan_delta(
|
|
173
|
-
self.url,
|
|
174
|
-
storage_options=self.storage_options,
|
|
175
|
-
credential_provider=None,
|
|
176
|
-
)
|
|
151
|
+
# add generated filters to hyperslice
|
|
152
|
+
hyper_slice = self.schema().add_generated_filters(hyper_slice)
|
|
153
|
+
|
|
154
|
+
delta_table = self.delta_table
|
|
155
|
+
partition_cols = delta_table.metadata().partition_columns
|
|
177
156
|
|
|
178
|
-
|
|
179
|
-
|
|
157
|
+
if len(hyper_slice) == 0:
|
|
158
|
+
file_filters = None
|
|
159
|
+
partition_filters = None
|
|
160
|
+
else:
|
|
161
|
+
file_filters = hyper_slice
|
|
162
|
+
partition_filters = [f for f in hyper_slice if f[0] in partition_cols]
|
|
180
163
|
|
|
181
|
-
|
|
182
|
-
|
|
164
|
+
pyarrow_table_existing_data = delta_table.to_pyarrow_table(
|
|
165
|
+
columns=columns,
|
|
166
|
+
partitions=partition_filters,
|
|
167
|
+
filters=file_filters,
|
|
168
|
+
)
|
|
183
169
|
|
|
184
|
-
return
|
|
170
|
+
return pl.from_arrow(pyarrow_table_existing_data)
|
|
185
171
|
|
|
186
172
|
def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
|
|
187
173
|
"""Convert Polars dataframe to pyarrow table with casted schema.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "datazone-sdk"
|
|
3
|
-
version = "6.0.1.dev9"
|
|
3
|
+
version = "6.0.1.dev11"
|
|
4
4
|
description = "Database and Delta storage client library for working with Delta Lake tables"
|
|
5
5
|
authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
|
|
6
6
|
requires-python = ">=3.10"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datazone_sdk-6.0.1.dev9 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/generated_columns.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|