datazone-sdk 6.0.1.dev4__tar.gz → 6.0.1.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/PKG-INFO +1 -1
  2. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/__init__.py +1 -8
  3. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/__init__.py +1 -1
  4. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/table.py +48 -26
  5. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/pyproject.toml +1 -1
  6. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/README.md +0 -0
  7. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/backtesting.py +0 -0
  8. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/caching/__init__.py +0 -0
  9. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/caching/parquet.py +0 -0
  10. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/db/__init__.py +0 -0
  11. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/db/base.py +0 -0
  12. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/db/cached.py +0 -0
  13. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/db/snapshot.py +0 -0
  14. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/db/standard.py +0 -0
  15. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/data_types.py +0 -0
  16. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/generated_columns.py +0 -0
  17. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/schema.py +0 -0
  18. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/slicing.py +0 -0
  19. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/deltastorage/store.py +0 -0
  20. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/testing/__init__.py +0 -0
  21. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev6}/datazone/testing/database_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datazone-sdk
3
- Version: 6.0.1.dev4
3
+ Version: 6.0.1.dev6
4
4
  Summary: Database and Delta storage client library for working with Delta Lake tables
5
5
  Author: Team Enigma
6
6
  Author-email: enigma@energinet.dk
@@ -1,13 +1,6 @@
1
1
  from datazone.backtesting import backtest
2
2
  from datazone.caching import ParquetCache
3
3
  from datazone.db import DatabaseClient, SnapshotDatabaseClient
4
- from datazone.deltastorage import (
5
- DeltaOverwriteFallbackWarning,
6
- Field,
7
- HyperSlice,
8
- Schema,
9
- Store,
10
- Table,
11
- )
4
+ from datazone.deltastorage import Field, HyperSlice, Schema, Store, Table
12
5
 
13
6
  from . import testing
@@ -1,4 +1,4 @@
1
1
  from .schema import Field, Schema
2
2
  from .slicing import HyperSlice
3
3
  from .store import Store
4
- from .table import DeltaOverwriteFallbackWarning, Table
4
+ from .table import Table
@@ -1,11 +1,9 @@
1
1
  import datetime as dt
2
- import warnings
3
2
  from typing import Any, Optional
4
3
 
5
4
  import deltalake as dl
6
5
  import polars as pl
7
6
  import pyarrow as pa
8
- from deltalake.exceptions import CommitFailedError
9
7
 
10
8
  from .schema import Schema
11
9
  from .slicing import HyperSlice
@@ -48,12 +46,29 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
48
46
  return " AND ".join(sql_parts)
49
47
 
50
48
 
51
- def _is_arrow_cast_commit_failure(error: CommitFailedError) -> bool:
52
- return "arrow_cast should have been simplified to cast" in str(error)
49
+ def _is_safe_partition_filter(filter_: tuple) -> bool:
50
+ return filter_[1] in ["=", "in"]
53
51
 
54
52
 
55
- class DeltaOverwriteFallbackWarning(RuntimeWarning):
56
- pass
53
+ def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
54
+ return filters if len(filters) > 0 else None
55
+
56
+
57
+ def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
58
+ col, op, val = filter_
59
+ if op == "=":
60
+ return pl.col(col) == val
61
+ if op == "in":
62
+ return pl.col(col).is_in(val)
63
+ if op == ">=":
64
+ return pl.col(col) >= val
65
+ if op == "<=":
66
+ return pl.col(col) <= val
67
+ if op == ">":
68
+ return pl.col(col) > val
69
+ if op == "<":
70
+ return pl.col(col) < val
71
+ raise ValueError(f"Unsupported operation: {op}")
57
72
 
58
73
 
59
74
  class Table:
@@ -115,8 +130,10 @@ class Table:
115
130
  if hyper_slice is None:
116
131
  hyper_slice = []
117
132
 
118
- # add generated filters to hyperslice
119
- hyper_slice = self.schema().add_generated_filters(hyper_slice)
133
+ # Generated filters are an optimization for partition pruning. Keep them out
134
+ # of row filters because partition values may be represented as strings by
135
+ # PyArrow even when the Delta schema has a richer logical type.
136
+ partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
120
137
 
121
138
  delta_table = self.delta_table
122
139
  partition_cols = delta_table.metadata().partition_columns
@@ -124,9 +141,23 @@ class Table:
124
141
  if len(hyper_slice) == 0:
125
142
  file_filters = None
126
143
  partition_filters = None
144
+ post_filters = []
127
145
  else:
128
- file_filters = hyper_slice
129
- partition_filters = [f for f in hyper_slice if f[0] in partition_cols]
146
+ file_filters = _empty_to_none(
147
+ [f for f in hyper_slice if f[0] not in partition_cols]
148
+ )
149
+ partition_filters = _empty_to_none(
150
+ [
151
+ f
152
+ for f in partition_hyper_slice
153
+ if f[0] in partition_cols and _is_safe_partition_filter(f)
154
+ ]
155
+ )
156
+ post_filters = [
157
+ f
158
+ for f in hyper_slice
159
+ if f[0] in partition_cols and not _is_safe_partition_filter(f)
160
+ ]
130
161
 
131
162
  pyarrow_table_existing_data = delta_table.to_pyarrow_table(
132
163
  columns=columns,
@@ -134,7 +165,10 @@ class Table:
134
165
  filters=file_filters,
135
166
  )
136
167
 
137
- return pl.from_arrow(pyarrow_table_existing_data)
168
+ df = pl.from_arrow(pyarrow_table_existing_data)
169
+ for filter_ in post_filters:
170
+ df = df.filter(_filter_to_polars_expr(filter_))
171
+ return df
138
172
 
139
173
  def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
140
174
  """Convert Polars dataframe to pyarrow table with casted schema.
@@ -188,26 +222,14 @@ class Table:
188
222
  schema = self.schema()
189
223
  data = self._to_writable_pyarrow_table(df=df, schema=schema)
190
224
 
225
+ hyper_slice = schema.add_generated_filters(hyper_slice)
226
+
191
227
  if len(hyper_slice) == 0:
192
228
  predicate = None
193
229
  else:
194
230
  predicate = _dnf_to_sql(hyper_slice)
195
231
 
196
- try:
197
- self._write_deltalake(data=data, mode="overwrite", predicate=predicate)
198
- except CommitFailedError as error:
199
- if predicate is None or not _is_arrow_cast_commit_failure(error):
200
- raise
201
-
202
- warnings.warn(
203
- "Delta predicate overwrite failed with the DataFusion arrow_cast bug. "
204
- "Falling back to delete followed by append; this is not atomic as a "
205
- "single overwrite commit.",
206
- DeltaOverwriteFallbackWarning,
207
- stacklevel=2,
208
- )
209
- self.delta_table.delete(predicate)
210
- self._write_deltalake(data=data, mode="append", predicate=None)
232
+ self._write_deltalake(data=data, mode="overwrite", predicate=predicate)
211
233
 
212
234
  def append(self, df: pl.DataFrame) -> None:
213
235
  """Append rows to Delta Lake. This will write data to the Delta Lake.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datazone-sdk"
3
- version = "6.0.1.dev4"
3
+ version = "6.0.1.dev6"
4
4
  description = "Database and Delta storage client library for working with Delta Lake tables"
5
5
  authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
6
6
  requires-python = ">=3.10"