datazone-sdk 6.0.1.dev6__tar.gz → 6.0.1.dev8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/PKG-INFO +1 -1
  2. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/table.py +40 -46
  3. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/pyproject.toml +1 -1
  4. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/README.md +0 -0
  5. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/__init__.py +0 -0
  6. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/backtesting.py +0 -0
  7. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/caching/__init__.py +0 -0
  8. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/caching/parquet.py +0 -0
  9. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/db/__init__.py +0 -0
  10. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/db/base.py +0 -0
  11. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/db/cached.py +0 -0
  12. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/db/snapshot.py +0 -0
  13. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/db/standard.py +0 -0
  14. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/__init__.py +0 -0
  15. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/data_types.py +0 -0
  16. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/generated_columns.py +0 -0
  17. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/schema.py +0 -0
  18. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/slicing.py +0 -0
  19. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/deltastorage/store.py +0 -0
  20. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/testing/__init__.py +0 -0
  21. {datazone_sdk-6.0.1.dev6 → datazone_sdk-6.0.1.dev8}/datazone/testing/database_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datazone-sdk
3
- Version: 6.0.1.dev6
3
+ Version: 6.0.1.dev8
4
4
  Summary: Database and Delta storage client library for working with Delta Lake tables
5
5
  Author: Team Enigma
6
6
  Author-email: enigma@energinet.dk
@@ -46,14 +46,6 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
46
46
  return " AND ".join(sql_parts)
47
47
 
48
48
 
49
- def _is_safe_partition_filter(filter_: tuple) -> bool:
50
- return filter_[1] in ["=", "in"]
51
-
52
-
53
- def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
54
- return filters if len(filters) > 0 else None
55
-
56
-
57
49
  def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
58
50
  col, op, val = filter_
59
51
  if op == "=":
@@ -122,53 +114,55 @@ class Table:
122
114
  def read(
123
115
  self, hyper_slice: Optional[HyperSlice] = None, columns=None
124
116
  ) -> pl.DataFrame:
125
- """Read from Delta table
117
+ """Read from Delta table.
118
+
119
+ All filters are pushed down to the native Delta reader. Partition
120
+ filters (including filters derived from generated partition columns)
121
+ result in partition pruning, while non-partition filters are pushed
122
+ down to the Parquet reader as predicates.
123
+
124
+ We use the native Polars Delta reader (`pl.scan_delta`) rather than
125
+ `DeltaTable.to_pyarrow_table(filters=...)` on purpose. The PyArrow
126
+ filter path evaluates predicates with PyArrow compute kernels, which
127
+ do not implement comparisons for the `string_view` type returned by
128
+ delta-rs. That mismatch raises errors such as
129
+ `ArrowNotImplementedError: Function 'greater_equal' has no kernel
130
+ matching input types (string_view, string_view)`. The native reader
131
+ evaluates predicates in its own engine and avoids this entirely.
132
+
133
+ Any further (post-)filtering that cannot be expressed as a pushdown
134
+ predicate is the responsibility of the caller.
126
135
 
127
136
  Args:
128
- hyper_slice (HyperSlice): Hyper sliced used to filter data
137
+ hyper_slice (HyperSlice): Hyper slice used to filter data.
138
+ columns: Optional list of columns to project.
129
139
  """
130
140
  if hyper_slice is None:
131
141
  hyper_slice = []
132
142
 
133
- # Generated filters are an optimization for partition pruning. Keep them out
134
- # of row filters because partition values may be represented as strings by
135
- # PyArrow even when the Delta schema has a richer logical type.
136
- partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
143
+ # Generated filters add predicates on generated partition columns
144
+ # (e.g. `date_utc` derived from `time_utc`) so the native reader can
145
+ # prune partitions even when the caller only filters the base column.
146
+ pushdown_slice = self.schema().add_generated_filters(hyper_slice)
147
+
148
+ # `credential_provider=None` makes Polars pass `storage_options` straight
149
+ # to the native object store, exactly like `dl.DeltaTable(...)` does. We
150
+ # avoid the default `credential_provider="auto"`, which would build a
151
+ # separate Polars-managed Azure credential and could diverge from the
152
+ # managed-identity / Azure CLI auth used everywhere else in this class.
153
+ lazy_frame = pl.scan_delta(
154
+ self.url,
155
+ storage_options=self.storage_options,
156
+ credential_provider=None,
157
+ )
137
158
 
138
- delta_table = self.delta_table
139
- partition_cols = delta_table.metadata().partition_columns
159
+ for filter_ in pushdown_slice:
160
+ lazy_frame = lazy_frame.filter(_filter_to_polars_expr(filter_))
140
161
 
141
- if len(hyper_slice) == 0:
142
- file_filters = None
143
- partition_filters = None
144
- post_filters = []
145
- else:
146
- file_filters = _empty_to_none(
147
- [f for f in hyper_slice if f[0] not in partition_cols]
148
- )
149
- partition_filters = _empty_to_none(
150
- [
151
- f
152
- for f in partition_hyper_slice
153
- if f[0] in partition_cols and _is_safe_partition_filter(f)
154
- ]
155
- )
156
- post_filters = [
157
- f
158
- for f in hyper_slice
159
- if f[0] in partition_cols and not _is_safe_partition_filter(f)
160
- ]
161
-
162
- pyarrow_table_existing_data = delta_table.to_pyarrow_table(
163
- columns=columns,
164
- partitions=partition_filters,
165
- filters=file_filters,
166
- )
162
+ if columns is not None:
163
+ lazy_frame = lazy_frame.select(columns)
167
164
 
168
- df = pl.from_arrow(pyarrow_table_existing_data)
169
- for filter_ in post_filters:
170
- df = df.filter(_filter_to_polars_expr(filter_))
171
- return df
165
+ return lazy_frame.collect()
172
166
 
173
167
  def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
174
168
  """Convert Polars dataframe to pyarrow table with casted schema.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datazone-sdk"
3
- version = "6.0.1.dev6"
3
+ version = "6.0.1.dev8"
4
4
  description = "Database and Delta storage client library for working with Delta Lake tables"
5
5
  authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
6
6
  requires-python = ">=3.10"