datazone-sdk 6.0.1.dev3__tar.gz → 6.0.1.dev5__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (21):
  1. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/PKG-INFO +1 -1
  2. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/__init__.py +8 -1
  3. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/__init__.py +1 -1
  4. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/table.py +63 -5
  5. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/pyproject.toml +1 -1
  6. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/README.md +0 -0
  7. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/backtesting.py +0 -0
  8. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/caching/__init__.py +0 -0
  9. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/caching/parquet.py +0 -0
  10. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/db/__init__.py +0 -0
  11. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/db/base.py +0 -0
  12. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/db/cached.py +0 -0
  13. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/db/snapshot.py +0 -0
  14. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/db/standard.py +0 -0
  15. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/data_types.py +0 -0
  16. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/generated_columns.py +0 -0
  17. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/schema.py +0 -0
  18. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/slicing.py +0 -0
  19. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/store.py +0 -0
  20. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/testing/__init__.py +0 -0
  21. {datazone_sdk-6.0.1.dev3 → datazone_sdk-6.0.1.dev5}/datazone/testing/database_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datazone-sdk
3
- Version: 6.0.1.dev3
3
+ Version: 6.0.1.dev5
4
4
  Summary: Database and Delta storage client library for working with Delta Lake tables
5
5
  Author: Team Enigma
6
6
  Author-email: enigma@energinet.dk
@@ -1,6 +1,13 @@
1
1
  from datazone.backtesting import backtest
2
2
  from datazone.caching import ParquetCache
3
3
  from datazone.db import DatabaseClient, SnapshotDatabaseClient
4
- from datazone.deltastorage import Field, HyperSlice, Schema, Store, Table
4
+ from datazone.deltastorage import (
5
+ DeltaOverwriteFallbackWarning,
6
+ Field,
7
+ HyperSlice,
8
+ Schema,
9
+ Store,
10
+ Table,
11
+ )
5
12
 
6
13
  from . import testing
@@ -1,4 +1,4 @@
1
1
  from .schema import Field, Schema
2
2
  from .slicing import HyperSlice
3
3
  from .store import Store
4
- from .table import Table
4
+ from .table import DeltaOverwriteFallbackWarning, Table
@@ -1,4 +1,5 @@
1
1
  import datetime as dt
2
+ import warnings
2
3
  from typing import Any, Optional
3
4
 
4
5
  import deltalake as dl
@@ -51,6 +52,35 @@ def _is_arrow_cast_commit_failure(error: CommitFailedError) -> bool:
51
52
  return "arrow_cast should have been simplified to cast" in str(error)
52
53
 
53
54
 
55
+ def _is_safe_partition_filter(filter_: tuple) -> bool:
56
+ return filter_[1] in ["=", "in"]
57
+
58
+
59
+ def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
60
+ return filters if len(filters) > 0 else None
61
+
62
+
63
+ def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
64
+ col, op, val = filter_
65
+ if op == "=":
66
+ return pl.col(col) == val
67
+ if op == "in":
68
+ return pl.col(col).is_in(val)
69
+ if op == ">=":
70
+ return pl.col(col) >= val
71
+ if op == "<=":
72
+ return pl.col(col) <= val
73
+ if op == ">":
74
+ return pl.col(col) > val
75
+ if op == "<":
76
+ return pl.col(col) < val
77
+ raise ValueError(f"Unsupported operation: {op}")
78
+
79
+
80
+ class DeltaOverwriteFallbackWarning(RuntimeWarning):
81
+ pass
82
+
83
+
54
84
  class Table:
55
85
  def __init__(
56
86
  self,
@@ -110,8 +140,10 @@ class Table:
110
140
  if hyper_slice is None:
111
141
  hyper_slice = []
112
142
 
113
- # add generated filters to hyperslice
114
- hyper_slice = self.schema().add_generated_filters(hyper_slice)
143
+ # Generated filters are an optimization for partition pruning. Keep them out
144
+ # of row filters because partition values may be represented as strings by
145
+ # PyArrow even when the Delta schema has a richer logical type.
146
+ partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
115
147
 
116
148
  delta_table = self.delta_table
117
149
  partition_cols = delta_table.metadata().partition_columns
@@ -119,9 +151,23 @@ class Table:
119
151
  if len(hyper_slice) == 0:
120
152
  file_filters = None
121
153
  partition_filters = None
154
+ post_filters = []
122
155
  else:
123
- file_filters = hyper_slice
124
- partition_filters = [f for f in hyper_slice if f[0] in partition_cols]
156
+ file_filters = _empty_to_none(
157
+ [f for f in hyper_slice if f[0] not in partition_cols]
158
+ )
159
+ partition_filters = _empty_to_none(
160
+ [
161
+ f
162
+ for f in partition_hyper_slice
163
+ if f[0] in partition_cols and _is_safe_partition_filter(f)
164
+ ]
165
+ )
166
+ post_filters = [
167
+ f
168
+ for f in hyper_slice
169
+ if f[0] in partition_cols and not _is_safe_partition_filter(f)
170
+ ]
125
171
 
126
172
  pyarrow_table_existing_data = delta_table.to_pyarrow_table(
127
173
  columns=columns,
@@ -129,7 +175,10 @@ class Table:
129
175
  filters=file_filters,
130
176
  )
131
177
 
132
- return pl.from_arrow(pyarrow_table_existing_data)
178
+ df = pl.from_arrow(pyarrow_table_existing_data)
179
+ for filter_ in post_filters:
180
+ df = df.filter(_filter_to_polars_expr(filter_))
181
+ return df
133
182
 
134
183
  def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
135
184
  """Convert Polars dataframe to pyarrow table with casted schema.
@@ -183,6 +232,8 @@ class Table:
183
232
  schema = self.schema()
184
233
  data = self._to_writable_pyarrow_table(df=df, schema=schema)
185
234
 
235
+ hyper_slice = schema.add_generated_filters(hyper_slice)
236
+
186
237
  if len(hyper_slice) == 0:
187
238
  predicate = None
188
239
  else:
@@ -194,6 +245,13 @@ class Table:
194
245
  if predicate is None or not _is_arrow_cast_commit_failure(error):
195
246
  raise
196
247
 
248
+ warnings.warn(
249
+ "Delta predicate overwrite failed with the DataFusion arrow_cast bug. "
250
+ "Falling back to delete followed by append; this is not atomic as a "
251
+ "single overwrite commit.",
252
+ DeltaOverwriteFallbackWarning,
253
+ stacklevel=2,
254
+ )
197
255
  self.delta_table.delete(predicate)
198
256
  self._write_deltalake(data=data, mode="append", predicate=None)
199
257
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datazone-sdk"
3
- version = "6.0.1.dev3"
3
+ version = "6.0.1.dev5"
4
4
  description = "Database and Delta storage client library for working with Delta Lake tables"
5
5
  authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
6
6
  requires-python = ">=3.10"