datazone-sdk 6.0.1.dev8__tar.gz → 6.0.1.dev10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/PKG-INFO +1 -1
  2. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/generated_columns.py +3 -2
  3. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/table.py +72 -36
  4. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/pyproject.toml +1 -1
  5. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/README.md +0 -0
  6. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/__init__.py +0 -0
  7. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/backtesting.py +0 -0
  8. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/caching/__init__.py +0 -0
  9. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/caching/parquet.py +0 -0
  10. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/db/__init__.py +0 -0
  11. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/db/base.py +0 -0
  12. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/db/cached.py +0 -0
  13. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/db/snapshot.py +0 -0
  14. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/db/standard.py +0 -0
  15. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/__init__.py +0 -0
  16. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/data_types.py +0 -0
  17. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/schema.py +0 -0
  18. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/slicing.py +0 -0
  19. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/deltastorage/store.py +0 -0
  20. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/testing/__init__.py +0 -0
  21. {datazone_sdk-6.0.1.dev8 → datazone_sdk-6.0.1.dev10}/datazone/testing/database_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datazone-sdk
3
- Version: 6.0.1.dev8
3
+ Version: 6.0.1.dev10
4
4
  Summary: Database and Delta storage client library for working with Delta Lake tables
5
5
  Author: Team Enigma
6
6
  Author-email: enigma@energinet.dk
@@ -112,8 +112,9 @@ class DateBucket(GeneratedColumn):
112
112
  case (">" | ">="):
113
113
  return [(">=", date_from_timestamp(value))]
114
114
  case "in":
115
- dates = [date_from_timestamp(timestamp) for timestamp in value]
116
- return [("in", list(dict.fromkeys(dates)))]
115
+ # de-duplicate: multiple timestamps can bucket to the same date
116
+ dates = {date_from_timestamp(timestamp) for timestamp in value}
117
+ return [("in", sorted(dates))]
117
118
  case _:
118
119
  # for other operations, we cannot make any
119
120
  # useful filters on the generated column
@@ -10,6 +10,16 @@ from .slicing import HyperSlice
10
10
 
11
11
 
12
12
  def _sql_literal(value: Any) -> str:
13
+ """Render a Python value as a *type-correct* SQL literal for Delta predicates.
14
+
15
+ Earlier versions quoted every value as a string (``f"{col} {op} '{val}'"``),
16
+ which worked because the old delta-rs/datafusion implicitly cast the string
17
+ literal to the column type. The upgraded datafusion type-checks predicates
18
+ strictly and rejects comparing a non-string column to a string literal
19
+ (e.g. ``Timestamp(us, "UTC") <= Utf8View``). So numbers are left unquoted,
20
+ booleans become ``TRUE``/``FALSE`` and ``None`` becomes ``NULL``; only real
21
+ strings/dates are quoted (and escaped).
22
+ """
13
23
  if isinstance(value, dt.datetime):
14
24
  return f"'{value.isoformat()}'"
15
25
  if isinstance(value, dt.date):
@@ -25,7 +35,12 @@ def _sql_literal(value: Any) -> str:
25
35
 
26
36
 
27
37
  def _dnf_to_sql(dnf: list[tuple]) -> str:
28
- """Convert DNF expression to SQL expression."""
38
+ """Convert a hyper slice (DNF expression) to a Delta predicate SQL string.
39
+
40
+ Needed because delta-rs overwrite/delete operations accept a SQL predicate
41
+ string, not the tuple filter format the SDK uses everywhere else. ``in`` is
42
+ expanded to ``OR`` of equalities since the predicate dialect has no ``IN``.
43
+ """
29
44
  if len(dnf) == 0:
30
45
  return "1=1"
31
46
 
@@ -46,7 +61,22 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
46
61
  return " AND ".join(sql_parts)
47
62
 
48
63
 
64
+ def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
65
+ """Return ``None`` for an empty filter list.
66
+
67
+ ``DeltaTable.to_pyarrow_table`` treats ``None`` as "no filter", so callers
68
+ must pass ``None`` rather than an empty list when there is nothing to filter
69
+ on.
70
+ """
71
+ return filters if len(filters) > 0 else None
72
+
73
+
49
74
  def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
75
+ """Convert a single tuple filter to a Polars expression.
76
+
77
+ Used to apply residual partition predicates as a Polars post-filter in
78
+ ``Table.read`` after delta-rs partition pruning has narrowed the files read.
79
+ """
50
80
  col, op, val = filter_
51
81
  if op == "=":
52
82
  return pl.col(col) == val
@@ -116,22 +146,16 @@ class Table:
116
146
  ) -> pl.DataFrame:
117
147
  """Read from Delta table.
118
148
 
119
- All filters are pushed down to the native Delta reader. Partition
120
- filters (including filters derived from generated partition columns)
121
- result in partition pruning, while non-partition filters are pushed
122
- down to the Parquet reader as predicates.
123
-
124
- We use the native Polars Delta reader (`pl.scan_delta`) rather than
125
- `DeltaTable.to_pyarrow_table(filters=...)` on purpose. The PyArrow
126
- filter path evaluates predicates with PyArrow compute kernels, which
127
- do not implement comparisons for the `string_view` type returned by
128
- delta-rs. That mismatch raises errors such as
129
- `ArrowNotImplementedError: Function 'greater_equal' has no kernel
130
- matching input types (string_view, string_view)`. The native reader
131
- evaluates predicates in its own engine and avoids this entirely.
149
+ Filters are split into three groups so each is handled by the engine
150
+ best suited for it:
132
151
 
133
- Any further (post-)filtering that cannot be expressed as a pushdown
134
- predicate is the responsibility of the caller.
152
+ * Partition filters (including filters derived from generated partition
153
+ columns) go to ``partitions=`` so delta-rs prunes whole partitions
154
+ before any data is read.
155
+ * Non-partition filters go to ``filters=`` and are pushed down to the
156
+ Parquet reader.
157
+ * Residual partition predicates that cannot be expressed as a partition
158
+ filter are applied afterwards as a Polars post-filter.
135
159
 
136
160
  Args:
137
161
  hyper_slice (HyperSlice): Hyper slice used to filter data.
@@ -140,29 +164,41 @@ class Table:
140
164
  if hyper_slice is None:
141
165
  hyper_slice = []
142
166
 
143
- # Generated filters add predicates on generated partition columns
144
- # (e.g. `date_utc` derived from `time_utc`) so the native reader can
145
- # prune partitions even when the caller only filters the base column.
146
- pushdown_slice = self.schema().add_generated_filters(hyper_slice)
147
-
148
- # `credential_provider=None` makes Polars pass `storage_options` straight
149
- # to the native object store, exactly like `dl.DeltaTable(...)` does. We
150
- # avoid the default `credential_provider="auto"`, which would build a
151
- # separate Polars-managed Azure credential and could diverge from the
152
- # managed-identity / Azure CLI auth used everywhere else in this class.
153
- lazy_frame = pl.scan_delta(
154
- self.url,
155
- storage_options=self.storage_options,
156
- credential_provider=None,
157
- )
167
+ # Generated filters are an optimization for partition pruning. Keep all
168
+ # partition filters out of row filters because partition values may be
169
+ # represented differently by PyArrow than by Delta's partition pruning.
170
+ partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
158
171
 
159
- for filter_ in pushdown_slice:
160
- lazy_frame = lazy_frame.filter(_filter_to_polars_expr(filter_))
172
+ delta_table = self.delta_table
173
+ partition_cols = delta_table.metadata().partition_columns
161
174
 
162
- if columns is not None:
163
- lazy_frame = lazy_frame.select(columns)
175
+ if len(hyper_slice) == 0:
176
+ file_filters = None
177
+ partition_filters = None
178
+ post_filters = []
179
+ else:
180
+ file_filters = _empty_to_none(
181
+ [f for f in hyper_slice if f[0] not in partition_cols]
182
+ )
183
+ partition_filters = _empty_to_none(
184
+ [f for f in partition_hyper_slice if f[0] in partition_cols]
185
+ )
186
+ post_filters = [
187
+ f
188
+ for f in hyper_slice
189
+ if f[0] in partition_cols and f not in partition_hyper_slice
190
+ ]
191
+
192
+ pyarrow_table_existing_data = delta_table.to_pyarrow_table(
193
+ columns=columns,
194
+ partitions=partition_filters,
195
+ filters=file_filters,
196
+ )
164
197
 
165
- return lazy_frame.collect()
198
+ df = pl.from_arrow(pyarrow_table_existing_data)
199
+ for filter_ in post_filters:
200
+ df = df.filter(_filter_to_polars_expr(filter_))
201
+ return df
166
202
 
167
203
  def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
168
204
  """Convert Polars dataframe to pyarrow table with casted schema.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datazone-sdk"
3
- version = "6.0.1.dev8"
3
+ version = "6.0.1.dev10"
4
4
  description = "Database and Delta storage client library for working with Delta Lake tables"
5
5
  authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
6
6
  requires-python = ">=3.10"