datazone-sdk 6.0.1.dev10__tar.gz → 6.0.1.dev11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/PKG-INFO +1 -1
  2. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/table.py +22 -53
  3. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/pyproject.toml +1 -1
  4. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/README.md +0 -0
  5. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/__init__.py +0 -0
  6. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/backtesting.py +0 -0
  7. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/caching/__init__.py +0 -0
  8. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/caching/parquet.py +0 -0
  9. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/db/__init__.py +0 -0
  10. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/db/base.py +0 -0
  11. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/db/cached.py +0 -0
  12. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/db/snapshot.py +0 -0
  13. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/db/standard.py +0 -0
  14. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/__init__.py +0 -0
  15. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/data_types.py +0 -0
  16. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/generated_columns.py +0 -0
  17. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/schema.py +0 -0
  18. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/slicing.py +0 -0
  19. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/deltastorage/store.py +0 -0
  20. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/testing/__init__.py +0 -0
  21. {datazone_sdk-6.0.1.dev10 → datazone_sdk-6.0.1.dev11}/datazone/testing/database_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datazone-sdk
3
- Version: 6.0.1.dev10
3
+ Version: 6.0.1.dev11
4
4
  Summary: Database and Delta storage client library for working with Delta Lake tables
5
5
  Author: Team Enigma
6
6
  Author-email: enigma@energinet.dk
@@ -12,12 +12,12 @@ from .slicing import HyperSlice
12
12
  def _sql_literal(value: Any) -> str:
13
13
  """Render a Python value as a *type-correct* SQL literal for Delta predicates.
14
14
 
15
- Earlier versions quoted every value as a string (``f"{col} {op} '{val}'"``),
16
- which worked because the old delta-rs/datafusion implicitly cast the string
17
- literal to the column type. The upgraded datafusion type-checks predicates
18
- strictly and rejects comparing a non-string column to a string literal
19
- (e.g. ``Timestamp(us, "UTC") <= Utf8View``). So numbers are left unquoted,
20
- booleans become ``TRUE``/``FALSE`` and ``None`` becomes ``NULL``; only real
15
+ datafusion type-checks predicates and does not always coerce a quoted
16
+ string literal to the column type (in particular not inside ``IN (...)``,
17
+ where a string compared to a timestamp column raises
18
+ ``Timestamp(us, "UTC") <= Utf8View``). Rendering each value with its real
19
+ type avoids relying on that coercion: numbers are left unquoted, booleans
20
+ become ``TRUE``/``FALSE`` and ``None`` becomes ``NULL``; only real
21
21
  strings/dates are quoted (and escaped).
22
22
  """
23
23
  if isinstance(value, dt.datetime):
@@ -38,8 +38,16 @@ def _dnf_to_sql(dnf: list[tuple]) -> str:
38
38
  """Convert a hyper slice (DNF expression) to a Delta predicate SQL string.
39
39
 
40
40
  Needed because delta-rs overwrite/delete operations accept a SQL predicate
41
- string, not the tuple filter format the SDK uses everywhere else. ``in`` is
42
- expanded to ``OR`` of equalities since the predicate dialect has no ``IN``.
41
+ string, not the tuple filter format the SDK uses everywhere else.
42
+
43
+ ``in`` is rendered as an OR-of-equalities rather than SQL ``IN (...)`` on
44
+ purpose: datafusion coerces a quoted string literal to the column type for
45
+ the binary ``=`` operator, but NOT inside ``IN (...)``. So a timestamp
46
+ filter written as ``time_utc IN ('2025-01-01T...')`` raises
47
+ ``Invalid comparison operation: Timestamp(us, "UTC") <= Utf8View``, while
48
+ ``(time_utc = '2025-01-01T...')`` is coerced and works. (For columns whose
49
+ type already matches the literal, e.g. ints, ``IN`` would be fine -- but the
50
+ OR form is correct for every column type.)
43
51
  """
44
52
  if len(dnf) == 0:
45
53
  return "1=1"
@@ -71,28 +79,6 @@ def _empty_to_none(filters: list[tuple]) -> list[tuple] | None:
71
79
  return filters if len(filters) > 0 else None
72
80
 
73
81
 
74
- def _filter_to_polars_expr(filter_: tuple) -> pl.Expr:
75
- """Convert a single tuple filter to a Polars expression.
76
-
77
- Used to apply residual partition predicates as a Polars post-filter in
78
- ``Table.read`` after delta-rs partition pruning has narrowed the files read.
79
- """
80
- col, op, val = filter_
81
- if op == "=":
82
- return pl.col(col) == val
83
- if op == "in":
84
- return pl.col(col).is_in(val)
85
- if op == ">=":
86
- return pl.col(col) >= val
87
- if op == "<=":
88
- return pl.col(col) <= val
89
- if op == ">":
90
- return pl.col(col) > val
91
- if op == "<":
92
- return pl.col(col) < val
93
- raise ValueError(f"Unsupported operation: {op}")
94
-
95
-
96
82
  class Table:
97
83
  def __init__(
98
84
  self,
@@ -146,7 +132,7 @@ class Table:
146
132
  ) -> pl.DataFrame:
147
133
  """Read from Delta table.
148
134
 
149
- Filters are split into three groups so each is handled by the engine
135
+ Filters are split into two groups so each is handled by the engine
150
136
  best suited for it:
151
137
 
152
138
  * Partition filters (including filters derived from generated partition
@@ -154,8 +140,6 @@ class Table:
154
140
  before any data is read.
155
141
  * Non-partition filters go to ``filters=`` and are pushed down to the
156
142
  Parquet reader.
157
- * Residual partition predicates that cannot be expressed as a partition
158
- filter are applied afterwards as a Polars post-filter.
159
143
 
160
144
  Args:
161
145
  hyper_slice (HyperSlice): Hyper slice used to filter data.
@@ -164,10 +148,8 @@ class Table:
164
148
  if hyper_slice is None:
165
149
  hyper_slice = []
166
150
 
167
- # Generated filters are an optimization for partition pruning. Keep all
168
- # partition filters out of row filters because partition values may be
169
- # represented differently by PyArrow than by Delta's partition pruning.
170
- partition_hyper_slice = self.schema().add_generated_filters(hyper_slice)
151
+ # add generated filters to hyperslice
152
+ hyper_slice = self.schema().add_generated_filters(hyper_slice)
171
153
 
172
154
  delta_table = self.delta_table
173
155
  partition_cols = delta_table.metadata().partition_columns
@@ -175,19 +157,9 @@ class Table:
175
157
  if len(hyper_slice) == 0:
176
158
  file_filters = None
177
159
  partition_filters = None
178
- post_filters = []
179
160
  else:
180
- file_filters = _empty_to_none(
181
- [f for f in hyper_slice if f[0] not in partition_cols]
182
- )
183
- partition_filters = _empty_to_none(
184
- [f for f in partition_hyper_slice if f[0] in partition_cols]
185
- )
186
- post_filters = [
187
- f
188
- for f in hyper_slice
189
- if f[0] in partition_cols and f not in partition_hyper_slice
190
- ]
161
+ file_filters = hyper_slice
162
+ partition_filters = [f for f in hyper_slice if f[0] in partition_cols]
191
163
 
192
164
  pyarrow_table_existing_data = delta_table.to_pyarrow_table(
193
165
  columns=columns,
@@ -195,10 +167,7 @@ class Table:
195
167
  filters=file_filters,
196
168
  )
197
169
 
198
- df = pl.from_arrow(pyarrow_table_existing_data)
199
- for filter_ in post_filters:
200
- df = df.filter(_filter_to_polars_expr(filter_))
201
- return df
170
+ return pl.from_arrow(pyarrow_table_existing_data)
202
171
 
203
172
  def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
204
173
  """Convert Polars dataframe to pyarrow table with casted schema.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datazone-sdk"
3
- version = "6.0.1.dev10"
3
+ version = "6.0.1.dev11"
4
4
  description = "Database and Delta storage client library for working with Delta Lake tables"
5
5
  authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
6
6
  requires-python = ">=3.10"