datazone-sdk 6.0.1.dev5__tar.gz → 6.0.1.dev7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/PKG-INFO +1 -1
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/__init__.py +1 -8
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/deltastorage/__init__.py +1 -1
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/deltastorage/table.py +6 -38
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/pyproject.toml +1 -1
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/README.md +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/backtesting.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/caching/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/caching/parquet.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/db/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/db/base.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/db/cached.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/db/snapshot.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/db/standard.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/deltastorage/data_types.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/deltastorage/generated_columns.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/deltastorage/schema.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/deltastorage/slicing.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/deltastorage/store.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/testing/__init__.py +0 -0
- {datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/testing/database_client.py +0 -0
|
@@ -1,13 +1,6 @@
|
|
|
1
1
|
from datazone.backtesting import backtest
|
|
2
2
|
from datazone.caching import ParquetCache
|
|
3
3
|
from datazone.db import DatabaseClient, SnapshotDatabaseClient
|
|
4
|
-
from datazone.deltastorage import (
|
|
5
|
-
DeltaOverwriteFallbackWarning,
|
|
6
|
-
Field,
|
|
7
|
-
HyperSlice,
|
|
8
|
-
Schema,
|
|
9
|
-
Store,
|
|
10
|
-
Table,
|
|
11
|
-
)
|
|
4
|
+
from datazone.deltastorage import Field, HyperSlice, Schema, Store, Table
|
|
12
5
|
|
|
13
6
|
from . import testing
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
import datetime as dt
|
|
2
|
-
import warnings
|
|
3
2
|
from typing import Any, Optional
|
|
4
3
|
|
|
5
4
|
import deltalake as dl
|
|
6
5
|
import polars as pl
|
|
7
6
|
import pyarrow as pa
|
|
8
|
-
from deltalake.exceptions import CommitFailedError
|
|
9
7
|
|
|
10
8
|
from .schema import Schema
|
|
11
9
|
from .slicing import HyperSlice
|
|
@@ -48,14 +46,6 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
|
|
|
48
46
|
return " AND ".join(sql_parts)
|
|
49
47
|
|
|
50
48
|
|
|
51
|
-
def _is_arrow_cast_commit_failure(error: CommitFailedError) -> bool:
|
|
52
|
-
return "arrow_cast should have been simplified to cast" in str(error)
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def _is_safe_partition_filter(filter_: tuple) -> bool:
|
|
56
|
-
return filter_[1] in ["=", "in"]
|
|
57
|
-
|
|
58
|
-
|
|
59
49
|
def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
|
|
60
50
|
return filters if len(filters) > 0 else None
|
|
61
51
|
|
|
@@ -77,10 +67,6 @@ def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
|
|
|
77
67
|
raise ValueError(f"Unsupported operation: {op}")
|
|
78
68
|
|
|
79
69
|
|
|
80
|
-
class DeltaOverwriteFallbackWarning(RuntimeWarning):
|
|
81
|
-
pass
|
|
82
|
-
|
|
83
|
-
|
|
84
70
|
class Table:
|
|
85
71
|
def __init__(
|
|
86
72
|
self,
|
|
@@ -140,9 +126,9 @@ class Table:
|
|
|
140
126
|
if hyper_slice is None:
|
|
141
127
|
hyper_slice = []
|
|
142
128
|
|
|
143
|
-
# Generated filters are an optimization for partition pruning. Keep
|
|
144
|
-
# of row filters because partition values may be
|
|
145
|
-
#
|
|
129
|
+
# Generated filters are an optimization for partition pruning. Keep all
|
|
130
|
+
# partition filters out of row filters because partition values may be
|
|
131
|
+
# represented differently by PyArrow than by Delta's partition pruning.
|
|
146
132
|
partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
|
|
147
133
|
|
|
148
134
|
delta_table = self.delta_table
|
|
@@ -157,16 +143,12 @@ class Table:
|
|
|
157
143
|
[f for f in hyper_slice if f[0] not in partition_cols]
|
|
158
144
|
)
|
|
159
145
|
partition_filters = _empty_to_none(
|
|
160
|
-
[
|
|
161
|
-
f
|
|
162
|
-
for f in partition_hyper_slice
|
|
163
|
-
if f[0] in partition_cols and _is_safe_partition_filter(f)
|
|
164
|
-
]
|
|
146
|
+
[f for f in partition_hyper_slice if f[0] in partition_cols]
|
|
165
147
|
)
|
|
166
148
|
post_filters = [
|
|
167
149
|
f
|
|
168
150
|
for f in hyper_slice
|
|
169
|
-
if f[0] in partition_cols and not _is_safe_partition_filter(f)
|
|
151
|
+
if f[0] in partition_cols and f not in partition_hyper_slice
|
|
170
152
|
]
|
|
171
153
|
|
|
172
154
|
pyarrow_table_existing_data = delta_table.to_pyarrow_table(
|
|
@@ -239,21 +221,7 @@ class Table:
|
|
|
239
221
|
else:
|
|
240
222
|
predicate = _dnf_to_sql(hyper_slice)
|
|
241
223
|
|
|
242
|
-
|
|
243
|
-
self._write_deltalake(data=data, mode="overwrite", predicate=predicate)
|
|
244
|
-
except CommitFailedError as error:
|
|
245
|
-
if predicate is None or not _is_arrow_cast_commit_failure(error):
|
|
246
|
-
raise
|
|
247
|
-
|
|
248
|
-
warnings.warn(
|
|
249
|
-
"Delta predicate overwrite failed with the DataFusion arrow_cast bug. "
|
|
250
|
-
"Falling back to delete followed by append; this is not atomic as a "
|
|
251
|
-
"single overwrite commit.",
|
|
252
|
-
DeltaOverwriteFallbackWarning,
|
|
253
|
-
stacklevel=2,
|
|
254
|
-
)
|
|
255
|
-
self.delta_table.delete(predicate)
|
|
256
|
-
self._write_deltalake(data=data, mode="append", predicate=None)
|
|
224
|
+
self._write_deltalake(data=data, mode="overwrite", predicate=predicate)
|
|
257
225
|
|
|
258
226
|
def append(self, df: pl.DataFrame) -> None:
|
|
259
227
|
"""Append rows to Delta Lake. This will write data to the Delta Lake.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "datazone-sdk"
|
|
3
|
-
version = "6.0.1.dev5"
|
|
3
|
+
version = "6.0.1.dev7"
|
|
4
4
|
description = "Database and Delta storage client library for working with Delta Lake tables"
|
|
5
5
|
authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
|
|
6
6
|
requires-python = ">=3.10"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datazone_sdk-6.0.1.dev5 → datazone_sdk-6.0.1.dev7}/datazone/deltastorage/generated_columns.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|