datazone-sdk 6.0.1.dev7__tar.gz → 6.0.1.dev9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/PKG-INFO +1 -1
  2. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/deltastorage/generated_columns.py +3 -2
  3. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/deltastorage/table.py +60 -39
  4. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/pyproject.toml +1 -1
  5. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/README.md +0 -0
  6. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/__init__.py +0 -0
  7. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/backtesting.py +0 -0
  8. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/caching/__init__.py +0 -0
  9. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/caching/parquet.py +0 -0
  10. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/db/__init__.py +0 -0
  11. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/db/base.py +0 -0
  12. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/db/cached.py +0 -0
  13. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/db/snapshot.py +0 -0
  14. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/db/standard.py +0 -0
  15. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/deltastorage/__init__.py +0 -0
  16. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/deltastorage/data_types.py +0 -0
  17. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/deltastorage/schema.py +0 -0
  18. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/deltastorage/slicing.py +0 -0
  19. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/deltastorage/store.py +0 -0
  20. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/testing/__init__.py +0 -0
  21. {datazone_sdk-6.0.1.dev7 → datazone_sdk-6.0.1.dev9}/datazone/testing/database_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datazone-sdk
3
- Version: 6.0.1.dev7
3
+ Version: 6.0.1.dev9
4
4
  Summary: Database and Delta storage client library for working with Delta Lake tables
5
5
  Author: Team Enigma
6
6
  Author-email: enigma@energinet.dk
@@ -112,8 +112,9 @@ class DateBucket(GeneratedColumn):
112
112
  case (">" | ">="):
113
113
  return [(">=", date_from_timestamp(value))]
114
114
  case "in":
115
- dates = [date_from_timestamp(timestamp) for timestamp in value]
116
- return [("in", list(dict.fromkeys(dates)))]
115
+ # de-duplicate: multiple timestamps can bucket to the same date
116
+ dates = {date_from_timestamp(timestamp) for timestamp in value}
117
+ return [("in", sorted(dates))]
117
118
  case _:
118
119
  # for other operations, we cannot make any
119
120
  # useful filters on the generated column
@@ -10,6 +10,16 @@ from .slicing import HyperSlice
10
10
 
11
11
 
12
12
  def _sql_literal(value: Any) -> str:
13
+ """Render a Python value as a *type-correct* SQL literal for Delta predicates.
14
+
15
+ Earlier versions quoted every value as a string (``f"{col} {op} '{val}'"``),
16
+ which worked because the old delta-rs/datafusion implicitly cast the string
17
+ literal to the column type. The upgraded datafusion type-checks predicates
18
+ strictly and rejects comparing a non-string column to a string literal
19
+ (e.g. ``Timestamp(us, "UTC") <= Utf8View``). So numbers are left unquoted,
20
+ booleans become ``TRUE``/``FALSE`` and ``None`` becomes ``NULL``; only real
21
+ strings/dates are quoted (and escaped).
22
+ """
13
23
  if isinstance(value, dt.datetime):
14
24
  return f"'{value.isoformat()}'"
15
25
  if isinstance(value, dt.date):
@@ -25,7 +35,12 @@ def _sql_literal(value: Any) -> str:
25
35
 
26
36
 
27
37
  def _dnf_to_sql(dnf: list[tuple]) -> str:
28
- """Convert DNF expression to SQL expression."""
38
+ """Convert a hyper slice (DNF expression) to a Delta predicate SQL string.
39
+
40
+ Needed because delta-rs overwrite/delete operations accept a SQL predicate
41
+ string, not the tuple filter format the SDK uses everywhere else. ``in`` is
42
+ expanded to ``OR`` of equalities since the predicate dialect has no ``IN``.
43
+ """
29
44
  if len(dnf) == 0:
30
45
  return "1=1"
31
46
 
@@ -46,11 +61,14 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
46
61
  return " AND ".join(sql_parts)
47
62
 
48
63
 
49
- def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
50
- return filters if len(filters) > 0 else None
51
-
52
-
53
64
  def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
65
+ """Convert a single tuple filter to a Polars expression.
66
+
67
+ Needed so the SDK's ``(column, op, value)`` filters can be pushed down as
68
+ predicates to the native Polars Delta reader in ``Table.read``, instead of
69
+ PyArrow filters, whose compute kernels fail on delta-rs ``string_view``
70
+ columns (``ArrowNotImplementedError``).
71
+ """
54
72
  col, op, val = filter_
55
73
  if op == "=":
56
74
  return pl.col(col) == val
@@ -118,49 +136,52 @@ class Table:
118
136
  def read(
119
137
  self, hyper_slice: Optional[HyperSlice] = None, columns=None
120
138
  ) -> pl.DataFrame:
121
- """Read from Delta table
139
+ """Read from Delta table.
140
+
141
+ All filters are pushed down to the native Delta reader. Partition
142
+ filters (including filters derived from generated partition columns)
143
+ result in partition pruning, while non-partition filters are pushed
144
+ down to the Parquet reader as predicates.
145
+
146
+ We use the native Polars Delta reader (`pl.scan_delta`) rather than
147
+ `DeltaTable.to_pyarrow_table(filters=...)` on purpose. The PyArrow
148
+ filter path evaluates predicates with PyArrow compute kernels, which
149
+ do not implement comparisons for the `string_view` type returned by
150
+ delta-rs. That mismatch raises errors such as
151
+ `ArrowNotImplementedError: Function 'greater_equal' has no kernel
152
+ matching input types (string_view, string_view)`. The native reader
153
+ evaluates predicates in its own engine and avoids this entirely.
122
154
 
123
155
  Args:
124
- hyper_slice (HyperSlice): Hyper sliced used to filter data
156
+ hyper_slice (HyperSlice): Hyper slice used to filter data.
157
+ columns: Optional list of columns to project.
125
158
  """
126
159
  if hyper_slice is None:
127
160
  hyper_slice = []
128
161
 
129
- # Generated filters are an optimization for partition pruning. Keep all
130
- # partition filters out of row filters because partition values may be
131
- # represented differently by PyArrow than by Delta's partition pruning.
132
- partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
162
+ # Generated filters add predicates on generated partition columns
163
+ # (e.g. `date_utc` derived from `time_utc`) so the native reader can
164
+ # prune partitions even when the caller only filters the base column.
165
+ pushdown_slice = self.schema().add_generated_filters(hyper_slice)
166
+
167
+ # `credential_provider=None` makes Polars pass `storage_options` straight
168
+ # to the native object store, exactly like `dl.DeltaTable(...)` does. We
169
+ # avoid the default `credential_provider="auto"`, which would build a
170
+ # separate Polars-managed Azure credential and could diverge from the
171
+ # managed-identity / Azure CLI auth used everywhere else in this class.
172
+ lazy_frame = pl.scan_delta(
173
+ self.url,
174
+ storage_options=self.storage_options,
175
+ credential_provider=None,
176
+ )
133
177
 
134
- delta_table = self.delta_table
135
- partition_cols = delta_table.metadata().partition_columns
178
+ for filter_ in pushdown_slice:
179
+ lazy_frame = lazy_frame.filter(_filter_to_polars_expr(filter_))
136
180
 
137
- if len(hyper_slice) == 0:
138
- file_filters = None
139
- partition_filters = None
140
- post_filters = []
141
- else:
142
- file_filters = _empty_to_none(
143
- [f for f in hyper_slice if f[0] not in partition_cols]
144
- )
145
- partition_filters = _empty_to_none(
146
- [f for f in partition_hyper_slice if f[0] in partition_cols]
147
- )
148
- post_filters = [
149
- f
150
- for f in hyper_slice
151
- if f[0] in partition_cols and f not in partition_hyper_slice
152
- ]
153
-
154
- pyarrow_table_existing_data = delta_table.to_pyarrow_table(
155
- columns=columns,
156
- partitions=partition_filters,
157
- filters=file_filters,
158
- )
181
+ if columns is not None:
182
+ lazy_frame = lazy_frame.select(columns)
159
183
 
160
- df = pl.from_arrow(pyarrow_table_existing_data)
161
- for filter_ in post_filters:
162
- df = df.filter(_filter_to_polars_expr(filter_))
163
- return df
184
+ return lazy_frame.collect()
164
185
 
165
186
  def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
166
187
  """Convert Polars dataframe to pyarrow table with casted schema.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datazone-sdk"
3
- version = "6.0.1.dev7"
3
+ version = "6.0.1.dev9"
4
4
  description = "Database and Delta storage client library for working with Delta Lake tables"
5
5
  authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
6
6
  requires-python = ">=3.10"