datazone-sdk 6.0.1.dev4__tar.gz → 6.0.1.dev5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/PKG-INFO +1 -1
  2. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/table.py +51 -5
  3. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/pyproject.toml +1 -1
  4. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/README.md +0 -0
  5. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/__init__.py +0 -0
  6. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/backtesting.py +0 -0
  7. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/caching/__init__.py +0 -0
  8. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/caching/parquet.py +0 -0
  9. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/db/__init__.py +0 -0
  10. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/db/base.py +0 -0
  11. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/db/cached.py +0 -0
  12. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/db/snapshot.py +0 -0
  13. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/db/standard.py +0 -0
  14. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/__init__.py +0 -0
  15. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/data_types.py +0 -0
  16. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/generated_columns.py +0 -0
  17. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/schema.py +0 -0
  18. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/slicing.py +0 -0
  19. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/deltastorage/store.py +0 -0
  20. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/testing/__init__.py +0 -0
  21. {datazone_sdk-6.0.1.dev4 → datazone_sdk-6.0.1.dev5}/datazone/testing/database_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datazone-sdk
3
- Version: 6.0.1.dev4
3
+ Version: 6.0.1.dev5
4
4
  Summary: Database and Delta storage client library for working with Delta Lake tables
5
5
  Author: Team Enigma
6
6
  Author-email: enigma@energinet.dk
@@ -52,6 +52,31 @@ def _is_arrow_cast_commit_failure(error: CommitFailedError) -> bool:
52
52
  return "arrow_cast should have been simplified to cast" in str(error)
53
53
 
54
54
 
55
+ def _is_safe_partition_filter(filter_: tuple) -> bool:
56
+ return filter_[1] in ["=", "in"]
57
+
58
+
59
+ def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
60
+ return filters if len(filters) > 0 else None
61
+
62
+
63
+ def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
64
+ col, op, val = filter_
65
+ if op == "=":
66
+ return pl.col(col) == val
67
+ if op == "in":
68
+ return pl.col(col).is_in(val)
69
+ if op == ">=":
70
+ return pl.col(col) >= val
71
+ if op == "<=":
72
+ return pl.col(col) <= val
73
+ if op == ">":
74
+ return pl.col(col) > val
75
+ if op == "<":
76
+ return pl.col(col) < val
77
+ raise ValueError(f"Unsupported operation: {op}")
78
+
79
+
55
80
  class DeltaOverwriteFallbackWarning(RuntimeWarning):
56
81
  pass
57
82
 
@@ -115,8 +140,10 @@ class Table:
115
140
  if hyper_slice is None:
116
141
  hyper_slice = []
117
142
 
118
- # add generated filters to hyperslice
119
- hyper_slice = self.schema().add_generated_filters(hyper_slice)
143
+ # Generated filters are an optimization for partition pruning. Keep them out
144
+ # of row filters because partition values may be represented as strings by
145
+ # PyArrow even when the Delta schema has a richer logical type.
146
+ partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
120
147
 
121
148
  delta_table = self.delta_table
122
149
  partition_cols = delta_table.metadata().partition_columns
@@ -124,9 +151,23 @@ class Table:
124
151
  if len(hyper_slice) == 0:
125
152
  file_filters = None
126
153
  partition_filters = None
154
+ post_filters = []
127
155
  else:
128
- file_filters = hyper_slice
129
- partition_filters = [f for f in hyper_slice if f[0] in partition_cols]
156
+ file_filters = _empty_to_none(
157
+ [f for f in hyper_slice if f[0] not in partition_cols]
158
+ )
159
+ partition_filters = _empty_to_none(
160
+ [
161
+ f
162
+ for f in partition_hyper_slice
163
+ if f[0] in partition_cols and _is_safe_partition_filter(f)
164
+ ]
165
+ )
166
+ post_filters = [
167
+ f
168
+ for f in hyper_slice
169
+ if f[0] in partition_cols and not _is_safe_partition_filter(f)
170
+ ]
130
171
 
131
172
  pyarrow_table_existing_data = delta_table.to_pyarrow_table(
132
173
  columns=columns,
@@ -134,7 +175,10 @@ class Table:
134
175
  filters=file_filters,
135
176
  )
136
177
 
137
- return pl.from_arrow(pyarrow_table_existing_data)
178
+ df = pl.from_arrow(pyarrow_table_existing_data)
179
+ for filter_ in post_filters:
180
+ df = df.filter(_filter_to_polars_expr(filter_))
181
+ return df
138
182
 
139
183
  def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
140
184
  """Convert Polars dataframe to pyarrow table with casted schema.
@@ -188,6 +232,8 @@ class Table:
188
232
  schema = self.schema()
189
233
  data = self._to_writable_pyarrow_table(df=df, schema=schema)
190
234
 
235
+ hyper_slice = schema.add_generated_filters(hyper_slice)
236
+
191
237
  if len(hyper_slice) == 0:
192
238
  predicate = None
193
239
  else:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datazone-sdk"
3
- version = "6.0.1.dev4"
3
+ version = "6.0.1.dev5"
4
4
  description = "Database and Delta storage client library for working with Delta Lake tables"
5
5
  authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
6
6
  requires-python = ">=3.10"