polars_runtime_compat-1.34.0b2-cp39-abi3-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (203)
  1. _polars_runtime_compat/.gitkeep +0 -0
  2. _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
  3. polars/__init__.py +528 -0
  4. polars/_cpu_check.py +265 -0
  5. polars/_dependencies.py +355 -0
  6. polars/_plr.py +99 -0
  7. polars/_plr.pyi +2496 -0
  8. polars/_reexport.py +23 -0
  9. polars/_typing.py +478 -0
  10. polars/_utils/__init__.py +37 -0
  11. polars/_utils/async_.py +102 -0
  12. polars/_utils/cache.py +176 -0
  13. polars/_utils/cloud.py +40 -0
  14. polars/_utils/constants.py +29 -0
  15. polars/_utils/construction/__init__.py +46 -0
  16. polars/_utils/construction/dataframe.py +1397 -0
  17. polars/_utils/construction/other.py +72 -0
  18. polars/_utils/construction/series.py +560 -0
  19. polars/_utils/construction/utils.py +118 -0
  20. polars/_utils/convert.py +224 -0
  21. polars/_utils/deprecation.py +406 -0
  22. polars/_utils/getitem.py +457 -0
  23. polars/_utils/logging.py +11 -0
  24. polars/_utils/nest_asyncio.py +264 -0
  25. polars/_utils/parquet.py +15 -0
  26. polars/_utils/parse/__init__.py +12 -0
  27. polars/_utils/parse/expr.py +242 -0
  28. polars/_utils/polars_version.py +19 -0
  29. polars/_utils/pycapsule.py +53 -0
  30. polars/_utils/scan.py +27 -0
  31. polars/_utils/serde.py +63 -0
  32. polars/_utils/slice.py +215 -0
  33. polars/_utils/udfs.py +1251 -0
  34. polars/_utils/unstable.py +63 -0
  35. polars/_utils/various.py +782 -0
  36. polars/_utils/wrap.py +25 -0
  37. polars/api.py +370 -0
  38. polars/catalog/__init__.py +0 -0
  39. polars/catalog/unity/__init__.py +19 -0
  40. polars/catalog/unity/client.py +733 -0
  41. polars/catalog/unity/models.py +152 -0
  42. polars/config.py +1571 -0
  43. polars/convert/__init__.py +25 -0
  44. polars/convert/general.py +1046 -0
  45. polars/convert/normalize.py +261 -0
  46. polars/dataframe/__init__.py +5 -0
  47. polars/dataframe/_html.py +186 -0
  48. polars/dataframe/frame.py +12582 -0
  49. polars/dataframe/group_by.py +1067 -0
  50. polars/dataframe/plotting.py +257 -0
  51. polars/datatype_expr/__init__.py +5 -0
  52. polars/datatype_expr/array.py +56 -0
  53. polars/datatype_expr/datatype_expr.py +304 -0
  54. polars/datatype_expr/list.py +18 -0
  55. polars/datatype_expr/struct.py +69 -0
  56. polars/datatypes/__init__.py +122 -0
  57. polars/datatypes/_parse.py +195 -0
  58. polars/datatypes/_utils.py +48 -0
  59. polars/datatypes/classes.py +1213 -0
  60. polars/datatypes/constants.py +11 -0
  61. polars/datatypes/constructor.py +172 -0
  62. polars/datatypes/convert.py +366 -0
  63. polars/datatypes/group.py +130 -0
  64. polars/exceptions.py +230 -0
  65. polars/expr/__init__.py +7 -0
  66. polars/expr/array.py +964 -0
  67. polars/expr/binary.py +346 -0
  68. polars/expr/categorical.py +306 -0
  69. polars/expr/datetime.py +2620 -0
  70. polars/expr/expr.py +11272 -0
  71. polars/expr/list.py +1408 -0
  72. polars/expr/meta.py +444 -0
  73. polars/expr/name.py +321 -0
  74. polars/expr/string.py +3045 -0
  75. polars/expr/struct.py +357 -0
  76. polars/expr/whenthen.py +185 -0
  77. polars/functions/__init__.py +193 -0
  78. polars/functions/aggregation/__init__.py +33 -0
  79. polars/functions/aggregation/horizontal.py +298 -0
  80. polars/functions/aggregation/vertical.py +341 -0
  81. polars/functions/as_datatype.py +848 -0
  82. polars/functions/business.py +138 -0
  83. polars/functions/col.py +384 -0
  84. polars/functions/datatype.py +121 -0
  85. polars/functions/eager.py +524 -0
  86. polars/functions/escape_regex.py +29 -0
  87. polars/functions/lazy.py +2751 -0
  88. polars/functions/len.py +68 -0
  89. polars/functions/lit.py +210 -0
  90. polars/functions/random.py +22 -0
  91. polars/functions/range/__init__.py +19 -0
  92. polars/functions/range/_utils.py +15 -0
  93. polars/functions/range/date_range.py +303 -0
  94. polars/functions/range/datetime_range.py +370 -0
  95. polars/functions/range/int_range.py +348 -0
  96. polars/functions/range/linear_space.py +311 -0
  97. polars/functions/range/time_range.py +287 -0
  98. polars/functions/repeat.py +301 -0
  99. polars/functions/whenthen.py +353 -0
  100. polars/interchange/__init__.py +10 -0
  101. polars/interchange/buffer.py +77 -0
  102. polars/interchange/column.py +190 -0
  103. polars/interchange/dataframe.py +230 -0
  104. polars/interchange/from_dataframe.py +328 -0
  105. polars/interchange/protocol.py +303 -0
  106. polars/interchange/utils.py +170 -0
  107. polars/io/__init__.py +64 -0
  108. polars/io/_utils.py +317 -0
  109. polars/io/avro.py +49 -0
  110. polars/io/clipboard.py +36 -0
  111. polars/io/cloud/__init__.py +17 -0
  112. polars/io/cloud/_utils.py +80 -0
  113. polars/io/cloud/credential_provider/__init__.py +17 -0
  114. polars/io/cloud/credential_provider/_builder.py +520 -0
  115. polars/io/cloud/credential_provider/_providers.py +618 -0
  116. polars/io/csv/__init__.py +9 -0
  117. polars/io/csv/_utils.py +38 -0
  118. polars/io/csv/batched_reader.py +142 -0
  119. polars/io/csv/functions.py +1495 -0
  120. polars/io/database/__init__.py +6 -0
  121. polars/io/database/_arrow_registry.py +70 -0
  122. polars/io/database/_cursor_proxies.py +147 -0
  123. polars/io/database/_executor.py +578 -0
  124. polars/io/database/_inference.py +314 -0
  125. polars/io/database/_utils.py +144 -0
  126. polars/io/database/functions.py +516 -0
  127. polars/io/delta.py +499 -0
  128. polars/io/iceberg/__init__.py +3 -0
  129. polars/io/iceberg/_utils.py +697 -0
  130. polars/io/iceberg/dataset.py +556 -0
  131. polars/io/iceberg/functions.py +151 -0
  132. polars/io/ipc/__init__.py +8 -0
  133. polars/io/ipc/functions.py +514 -0
  134. polars/io/json/__init__.py +3 -0
  135. polars/io/json/read.py +101 -0
  136. polars/io/ndjson.py +332 -0
  137. polars/io/parquet/__init__.py +17 -0
  138. polars/io/parquet/field_overwrites.py +140 -0
  139. polars/io/parquet/functions.py +722 -0
  140. polars/io/partition.py +491 -0
  141. polars/io/plugins.py +187 -0
  142. polars/io/pyarrow_dataset/__init__.py +5 -0
  143. polars/io/pyarrow_dataset/anonymous_scan.py +109 -0
  144. polars/io/pyarrow_dataset/functions.py +79 -0
  145. polars/io/scan_options/__init__.py +5 -0
  146. polars/io/scan_options/_options.py +59 -0
  147. polars/io/scan_options/cast_options.py +126 -0
  148. polars/io/spreadsheet/__init__.py +6 -0
  149. polars/io/spreadsheet/_utils.py +52 -0
  150. polars/io/spreadsheet/_write_utils.py +647 -0
  151. polars/io/spreadsheet/functions.py +1323 -0
  152. polars/lazyframe/__init__.py +9 -0
  153. polars/lazyframe/engine_config.py +61 -0
  154. polars/lazyframe/frame.py +8564 -0
  155. polars/lazyframe/group_by.py +669 -0
  156. polars/lazyframe/in_process.py +42 -0
  157. polars/lazyframe/opt_flags.py +333 -0
  158. polars/meta/__init__.py +14 -0
  159. polars/meta/build.py +33 -0
  160. polars/meta/index_type.py +27 -0
  161. polars/meta/thread_pool.py +50 -0
  162. polars/meta/versions.py +120 -0
  163. polars/ml/__init__.py +0 -0
  164. polars/ml/torch.py +213 -0
  165. polars/ml/utilities.py +30 -0
  166. polars/plugins.py +155 -0
  167. polars/py.typed +0 -0
  168. polars/pyproject.toml +96 -0
  169. polars/schema.py +265 -0
  170. polars/selectors.py +3117 -0
  171. polars/series/__init__.py +5 -0
  172. polars/series/array.py +776 -0
  173. polars/series/binary.py +254 -0
  174. polars/series/categorical.py +246 -0
  175. polars/series/datetime.py +2275 -0
  176. polars/series/list.py +1087 -0
  177. polars/series/plotting.py +191 -0
  178. polars/series/series.py +9197 -0
  179. polars/series/string.py +2367 -0
  180. polars/series/struct.py +154 -0
  181. polars/series/utils.py +191 -0
  182. polars/sql/__init__.py +7 -0
  183. polars/sql/context.py +677 -0
  184. polars/sql/functions.py +139 -0
  185. polars/string_cache.py +185 -0
  186. polars/testing/__init__.py +13 -0
  187. polars/testing/asserts/__init__.py +9 -0
  188. polars/testing/asserts/frame.py +231 -0
  189. polars/testing/asserts/series.py +219 -0
  190. polars/testing/asserts/utils.py +12 -0
  191. polars/testing/parametric/__init__.py +33 -0
  192. polars/testing/parametric/profiles.py +107 -0
  193. polars/testing/parametric/strategies/__init__.py +22 -0
  194. polars/testing/parametric/strategies/_utils.py +14 -0
  195. polars/testing/parametric/strategies/core.py +615 -0
  196. polars/testing/parametric/strategies/data.py +452 -0
  197. polars/testing/parametric/strategies/dtype.py +436 -0
  198. polars/testing/parametric/strategies/legacy.py +169 -0
  199. polars/type_aliases.py +24 -0
  200. polars_runtime_compat-1.34.0b2.dist-info/METADATA +190 -0
  201. polars_runtime_compat-1.34.0b2.dist-info/RECORD +203 -0
  202. polars_runtime_compat-1.34.0b2.dist-info/WHEEL +4 -0
  203. polars_runtime_compat-1.34.0b2.dist-info/licenses/LICENSE +20 -0
polars/io/iceberg/_utils.py
@@ -0,0 +1,697 @@
+ from __future__ import annotations
+
+ import abc
+ import ast
+ import contextlib
+ from _ast import GtE, Lt, LtE
+ from ast import (
+     Attribute,
+     BinOp,
+     BitAnd,
+     BitOr,
+     Call,
+     Compare,
+     Constant,
+     Eq,
+     Gt,
+     Invert,
+     List,
+     Name,
+     UnaryOp,
+ )
+ from dataclasses import dataclass
+ from functools import cache, singledispatch
+ from typing import TYPE_CHECKING, Any, Callable
+
+ import polars._reexport as pl
+ from polars._utils.convert import to_py_date, to_py_datetime
+ from polars._utils.logging import eprint
+ from polars._utils.wrap import wrap_s
+ from polars.exceptions import ComputeError
+
+ if TYPE_CHECKING:
+     from collections.abc import Sequence
+     from datetime import date, datetime
+
+     import pyiceberg
+     import pyiceberg.schema
+     from pyiceberg.manifest import DataFile
+     from pyiceberg.table import Table
+     from pyiceberg.types import IcebergType
+
+     from polars import DataFrame, Series
+ else:
+     from polars._dependencies import pyiceberg
+
+ _temporal_conversions: dict[str, Callable[..., datetime | date]] = {
+     "to_py_date": to_py_date,
+     "to_py_datetime": to_py_datetime,
+ }
+
+ ICEBERG_TIME_TO_NS: int = 1000
+
+
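The constant above scales Iceberg's time representation to the one polars uses: Iceberg stores time values with microsecond precision, while polars' Time type is nanosecond-based. A quick sanity check of that scaling (illustrative, not part of the diffed file):

    import polars as pl

    micros = 12 * 3_600 * 1_000_000  # 12:00:00 as an Iceberg time value (us)
    nanos = micros * 1000            # ICEBERG_TIME_TO_NS
    assert str(pl.Series([nanos]).cast(pl.Time)[0]) == "12:00:00"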
+ def _scan_pyarrow_dataset_impl(
+     tbl: Table,
+     with_columns: list[str] | None = None,
+     predicate: str | None = None,
+     n_rows: int | None = None,
+     snapshot_id: int | None = None,
+     **kwargs: Any,
+ ) -> DataFrame | Series:
+     """
+     Take the projected columns and materialize an arrow table.
+
+     Parameters
+     ----------
+     tbl
+         The Iceberg table to scan.
+     with_columns
+         Columns that are projected.
+     predicate
+         PyArrow expression, as a string, that can be evaluated with eval.
+     n_rows
+         Materialize only n rows from the arrow dataset.
+     snapshot_id
+         The snapshot ID to scan from.
+     batch_size
+         The maximum row count for scanned pyarrow record batches.
+     kwargs
+         For backward compatibility.
+
+     Returns
+     -------
+     DataFrame
+     """
+     from polars import from_arrow
+
+     scan = tbl.scan(limit=n_rows, snapshot_id=snapshot_id)
+
+     if with_columns is not None:
+         scan = scan.select(*with_columns)
+
+     if predicate is not None:
+         try:
+             expr_ast = _to_ast(predicate)
+             pyiceberg_expr = _convert_predicate(expr_ast)
+         except ValueError as e:
+             msg = f"Could not convert predicate to PyIceberg: {predicate}"
+             raise ValueError(msg) from e
+
+         scan = scan.filter(pyiceberg_expr)
+
+     return from_arrow(scan.to_arrow())
+
+
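A minimal usage sketch of the scan function above (assuming a PyIceberg catalog named "default" with a table "db.events"; both names and the predicate string are illustrative):

    from pyiceberg.catalog import load_catalog

    tbl = load_catalog("default").load_table("db.events")
    df = _scan_pyarrow_dataset_impl(
        tbl,
        with_columns=["id", "ts"],
        predicate="(pa.compute.field('id') > 100)",  # stringified PyArrow predicate
        n_rows=1_000,
    )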
+ def _to_ast(expr: str) -> ast.expr:
+     """
+     Convert a Python expression string to an AST.
+
+     This takes the PyArrow expression (as a string) and converts it into a
+     Python AST that can be traversed to build the equivalent PyIceberg
+     expression.
+
+     The reason for going through an AST is that the PyArrow expression itself
+     doesn't expose any methods/properties for traversing the expression, and
+     we need that traversal to convert it into a PyIceberg expression.
+
+     Parameters
+     ----------
+     expr
+         The string expression.
+
+     Returns
+     -------
+     The AST representing the Arrow expression.
+     """
+     return ast.parse(expr, mode="eval").body
+
+
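For instance, a stringified comparison parses to an ast.Compare node whose left side is the pa.compute.field(...) call (illustrative):

    import ast

    node = _to_ast("(pa.compute.field('id') > 100)")
    assert isinstance(node, ast.Compare)
    assert isinstance(node.left, ast.Call)  # the pa.compute.field('id') call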
+ @singledispatch
+ def _convert_predicate(a: Any) -> Any:
+     """Walk the AST to convert the PyArrow expression to a PyIceberg expression."""
+     msg = f"Unexpected symbol: {a}"
+     raise ValueError(msg)
+
+
+ @_convert_predicate.register(Constant)
+ def _(a: Constant) -> Any:
+     return a.value
+
+
+ @_convert_predicate.register(Name)
+ def _(a: Name) -> Any:
+     return a.id
+
+
+ @_convert_predicate.register(UnaryOp)
+ def _(a: UnaryOp) -> Any:
+     if isinstance(a.op, Invert):
+         return pyiceberg.expressions.Not(_convert_predicate(a.operand))
+     else:
+         msg = f"Unexpected UnaryOp: {a}"
+         raise TypeError(msg)
+
+
+ @_convert_predicate.register(Call)
+ def _(a: Call) -> Any:
+     args = [_convert_predicate(arg) for arg in a.args]
+     f = _convert_predicate(a.func)
+     if f == "field":
+         return args
+     elif f == "scalar":
+         return args[0]
+     elif f in _temporal_conversions:
+         # convert from polars-native i64 to ISO8601 string
+         return _temporal_conversions[f](*args).isoformat()
+     else:
+         ref = _convert_predicate(a.func.value)[0]  # type: ignore[attr-defined]
+         if f == "isin":
+             return pyiceberg.expressions.In(ref, args[0])
+         elif f == "is_null":
+             return pyiceberg.expressions.IsNull(ref)
+         elif f == "is_nan":
+             return pyiceberg.expressions.IsNaN(ref)
+
+     msg = f"Unknown call: {f!r}"
+     raise ValueError(msg)
+
+
+ @_convert_predicate.register(Attribute)
+ def _(a: Attribute) -> Any:
+     return a.attr
+
+
+ @_convert_predicate.register(BinOp)
+ def _(a: BinOp) -> Any:
+     lhs = _convert_predicate(a.left)
+     rhs = _convert_predicate(a.right)
+
+     op = a.op
+     if isinstance(op, BitAnd):
+         return pyiceberg.expressions.And(lhs, rhs)
+     if isinstance(op, BitOr):
+         return pyiceberg.expressions.Or(lhs, rhs)
+     else:
+         msg = f"Unknown: {lhs} {op} {rhs}"
+         raise TypeError(msg)
+
+
+ @_convert_predicate.register(Compare)
+ def _(a: Compare) -> Any:
+     op = a.ops[0]
+     lhs = _convert_predicate(a.left)[0]
+     rhs = _convert_predicate(a.comparators[0])
+
+     if isinstance(op, Gt):
+         return pyiceberg.expressions.GreaterThan(lhs, rhs)
+     if isinstance(op, GtE):
+         return pyiceberg.expressions.GreaterThanOrEqual(lhs, rhs)
+     if isinstance(op, Eq):
+         return pyiceberg.expressions.EqualTo(lhs, rhs)
+     if isinstance(op, Lt):
+         return pyiceberg.expressions.LessThan(lhs, rhs)
+     if isinstance(op, LtE):
+         return pyiceberg.expressions.LessThanOrEqual(lhs, rhs)
+     else:
+         msg = f"Unknown comparison: {op}"
+         raise TypeError(msg)
+
+
+ @_convert_predicate.register(List)
+ def _(a: List) -> Any:
+     return [_convert_predicate(e) for e in a.elts]
+
+
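Tracing the dispatchers above end to end, here is how a few predicate strings of the shape polars emits would convert (a sketch assuming PyIceberg is installed; expected results shown as comments):

    # field('id') resolves to ["id"]; Compare then takes lhs[0] and the constant.
    expr = _convert_predicate(_to_ast("(pa.compute.field('id') > 100)"))
    # -> pyiceberg.expressions.GreaterThan("id", 100)

    expr = _convert_predicate(_to_ast("pa.compute.field('id').isin([1, 2])"))
    # -> pyiceberg.expressions.In("id", [1, 2])

    expr = _convert_predicate(_to_ast("~pa.compute.field('name').is_null()"))
    # -> pyiceberg.expressions.Not(pyiceberg.expressions.IsNull("name"))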
+ class IdentityTransformedPartitionValuesBuilder:
+     def __init__(
+         self,
+         table: Table,
+         projected_schema: pyiceberg.schema.Schema,
+     ) -> None:
+         import pyiceberg.schema
+         from pyiceberg.io.pyarrow import schema_to_pyarrow
+         from pyiceberg.transforms import IdentityTransform
+         from pyiceberg.types import (
+             DoubleType,
+             FloatType,
+             IntegerType,
+             LongType,
+         )
+
+         projected_ids: set[int] = projected_schema.field_ids
+
+         # {source_field_id: [values] | error_message}
+         self.partition_values: dict[int, list[Any] | str] = {}
+         # Logical types are constructed from their physical representation and
+         # cast afterwards, e.g. Datetime is built as Int64 and then cast (see `finish`).
+         self.partition_values_dtypes: dict[int, pl.DataType] = {}
+
+         # {spec_id: [(partition_value_index, source_field_id)]}
+         self.partition_spec_id_to_identity_transforms: dict[
+             int, list[tuple[int, int]]
+         ] = {}
+
+         partition_specs = table.specs()
+
+         for spec_id, spec in partition_specs.items():
+             out = []
+
+             for field_index, field in enumerate(spec.fields):
+                 if field.source_id in projected_ids and isinstance(
+                     field.transform, IdentityTransform
+                 ):
+                     out.append((field_index, field.source_id))
+                     self.partition_values[field.source_id] = []
+
+             self.partition_spec_id_to_identity_transforms[spec_id] = out
+
+         for field_id in self.partition_values:
+             projected_field = projected_schema.find_field(field_id)
+             projected_type = projected_field.field_type
+
+             _, output_dtype = pl.Schema(
+                 schema_to_pyarrow(pyiceberg.schema.Schema(projected_field))
+             ).popitem()
+
+             self.partition_values_dtypes[field_id] = output_dtype
+
+             if not projected_type.is_primitive or output_dtype.is_nested():
+                 self.partition_values[field_id] = (
+                     f"non-primitive type: {projected_type = } {output_dtype = }"
+                 )
+
+             for schema in table.schemas().values():
+                 try:
+                     type_this_schema = schema.find_field(field_id).field_type
+                 except ValueError:
+                     continue
+
+                 if not (
+                     projected_type == type_this_schema
+                     or (
+                         isinstance(projected_type, LongType)
+                         and isinstance(type_this_schema, IntegerType)
+                     )
+                     or (
+                         isinstance(projected_type, (DoubleType, FloatType))
+                         and isinstance(type_this_schema, (DoubleType, FloatType))
+                     )
+                 ):
+                     self.partition_values[field_id] = (
+                         f"unsupported type change: from: {type_this_schema}, "
+                         f"to: {projected_type}"
+                     )
+
+     def push_partition_values(
+         self,
+         *,
+         current_index: int,
+         partition_spec_id: int,
+         partition_values: pyiceberg.typedef.Record,
+     ) -> None:
+         try:
+             identity_transforms = self.partition_spec_id_to_identity_transforms[
+                 partition_spec_id
+             ]
+         except KeyError:
+             self.partition_values = {
+                 k: f"partition spec ID not found: {partition_spec_id}"
+                 for k in self.partition_values
+             }
+             return
+
+         for i, source_field_id in identity_transforms:
+             partition_value = partition_values[i]
+
+             if isinstance(values := self.partition_values[source_field_id], list):
+                 # extend() - there can be gaps from partitions being
+                 # added/removed/re-added
+                 values.extend(None for _ in range(current_index - len(values)))
+                 values.append(partition_value)
+
+     def finish(self) -> dict[int, pl.Series | str]:
+         from polars.datatypes import Date, Datetime, Duration, Int32, Int64, Time
+
+         out: dict[int, pl.Series | str] = {}
+
+         for field_id, v in self.partition_values.items():
+             if isinstance(v, str):
+                 out[field_id] = v
+             else:
+                 try:
+                     output_dtype = self.partition_values_dtypes[field_id]
+
+                     constructor_dtype = (
+                         Int64
+                         if isinstance(output_dtype, (Datetime, Duration, Time))
+                         else Int32
+                         if isinstance(output_dtype, Date)
+                         else output_dtype
+                     )
+
+                     s = pl.Series(v, dtype=constructor_dtype)
+
+                     assert not s.dtype.is_nested()
+
+                     if isinstance(output_dtype, Time):
+                         # Physical from PyIceberg is in microseconds, physical
+                         # used by polars is in nanoseconds.
+                         s = s * ICEBERG_TIME_TO_NS
+
+                     s = s.cast(output_dtype)
+
+                     out[field_id] = s
+
+                 except Exception as e:
+                     out[field_id] = f"failed to load partition values: {e}"
+
+         return out
+
+
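The extend() in push_partition_values pads with nulls for files whose partition spec carried no identity transform for the column; the bookkeeping reduces to this pure-Python sketch (illustrative values):

    values: list[str | None] = []
    for current_index, value in [(0, "a"), (3, "b")]:  # files 1-2 contribute nothing
        values.extend(None for _ in range(current_index - len(values)))
        values.append(value)
    assert values == ["a", None, None, "b"]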
+ class IcebergStatisticsLoader:
+     def __init__(
+         self,
+         table: Table,
+         projected_filter_schema: pyiceberg.schema.Schema,
+     ) -> None:
+         import pyiceberg.schema
+         from pyiceberg.io.pyarrow import schema_to_pyarrow
+
+         import polars as pl
+         import polars._utils.logging
+
+         verbose = polars._utils.logging.verbose()
+
+         self.file_column_statistics: dict[int, IcebergColumnStatisticsLoader] = {}
+         self.load_as_empty_statistics: list[str] = []
+         self.file_lengths: list[int] = []
+         self.projected_filter_schema = projected_filter_schema
+
+         for field in projected_filter_schema.fields:
+             field_all_types = set()
+
+             for schema in table.schemas().values():
+                 with contextlib.suppress(ValueError):
+                     field_all_types.add(schema.find_field(field.field_id).field_type)
+
+             _, field_polars_dtype = pl.Schema(
+                 schema_to_pyarrow(pyiceberg.schema.Schema(field))
+             ).popitem()
+
+             load_from_bytes_impl = LoadFromBytesImpl.init_for_field_type(
+                 field.field_type,
+                 field_all_types,
+                 field_polars_dtype,
+             )
+
+             if verbose:
+                 _load_from_bytes_impl = (
+                     type(load_from_bytes_impl).__name__
+                     if load_from_bytes_impl is not None
+                     else "None"
+                 )
+
+                 eprint(
+                     "IcebergStatisticsLoader: "
+                     f"{field.name = }, "
+                     f"{field.field_id = }, "
+                     f"{field.field_type = }, "
+                     f"{field_all_types = }, "
+                     f"{field_polars_dtype = }, "
+                     f"{_load_from_bytes_impl = }"
+                 )
+
+             self.file_column_statistics[field.field_id] = IcebergColumnStatisticsLoader(
+                 field_id=field.field_id,
+                 column_name=field.name,
+                 column_dtype=field_polars_dtype,
+                 load_from_bytes_impl=load_from_bytes_impl,
+                 min_values=[],
+                 max_values=[],
+                 null_count=[],
+             )
+
+     def push_file_statistics(self, file: DataFile) -> None:
+         self.file_lengths.append(file.record_count)
+
+         for stats in self.file_column_statistics.values():
+             stats.push_file_statistics(file)
+
+     def finish(
+         self,
+         expected_height: int,
+         identity_transformed_values: dict[int, pl.Series | str],
+     ) -> pl.DataFrame:
+         import polars as pl
+
+         out: list[pl.DataFrame] = [
+             pl.Series("len", self.file_lengths, dtype=pl.UInt32).to_frame()
+         ]
+
+         for field_id, stat_builder in self.file_column_statistics.items():
+             if (p := identity_transformed_values.get(field_id)) is not None:
+                 if isinstance(p, str):
+                     msg = f"statistics load failure for filter column: {p}"
+                     raise ComputeError(msg)
+
+             column_stats_df = stat_builder.finish(expected_height, p)
+             out.append(column_stats_df)
+
+         return pl.concat(out, how="horizontal")
+
+
+ @dataclass
+ class IcebergColumnStatisticsLoader:
+     column_name: str
+     column_dtype: pl.DataType
+     field_id: int
+     load_from_bytes_impl: LoadFromBytesImpl | None
+     null_count: list[int | None]
+     min_values: list[bytes | None]
+     max_values: list[bytes | None]
+
+     def push_file_statistics(self, file: DataFile) -> None:
+         self.null_count.append(file.null_value_counts.get(self.field_id))
+
+         if self.load_from_bytes_impl is not None:
+             self.min_values.append(file.lower_bounds.get(self.field_id))
+             self.max_values.append(file.upper_bounds.get(self.field_id))
+
+     def finish(
+         self,
+         expected_height: int,
+         identity_transformed_values: pl.Series | None,
+     ) -> pl.DataFrame:
+         import polars as pl
+
+         c = self.column_name
+         assert len(self.null_count) == expected_height
+
+         out = pl.Series(f"{c}_nc", self.null_count, dtype=pl.UInt32).to_frame()
+
+         if self.load_from_bytes_impl is None:
+             s = (
+                 identity_transformed_values
+                 if identity_transformed_values is not None
+                 else pl.repeat(None, expected_height, dtype=self.column_dtype)
+             )
+
+             return out.with_columns(s.alias(f"{c}_min"), s.alias(f"{c}_max"))
+
+         assert len(self.min_values) == expected_height
+         assert len(self.max_values) == expected_height
+
+         if self.column_dtype.is_nested():
+             raise NotImplementedError
+
+         min_values = self.load_from_bytes_impl.load_from_bytes(self.min_values)
+         max_values = self.load_from_bytes_impl.load_from_bytes(self.max_values)
+
+         if identity_transformed_values is not None:
+             assert identity_transformed_values.dtype == self.column_dtype
+
+             identity_transformed_values = identity_transformed_values.extend_constant(
+                 None, expected_height - identity_transformed_values.len()
+             )
+
+             min_values = identity_transformed_values.fill_null(min_values)
+             max_values = identity_transformed_values.fill_null(max_values)
+
+         return out.with_columns(
+             min_values.alias(f"{c}_min"), max_values.alias(f"{c}_max")
+         )
+
+
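Note how IcebergColumnStatisticsLoader.finish prefers exact identity-transformed partition constants and only falls back to bounds decoded from bytes; the fill_null merge behaves like this (illustrative values):

    import polars as pl

    identity = pl.Series([5, None, 7])  # exact per-file partition constants
    decoded = pl.Series([4, 2, None])   # min/max decoded from bounds bytes
    assert identity.fill_null(decoded).to_list() == [5, 2, 7]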
+ # Lazy init instead of global const as PyIceberg is an optional dependency
+ @cache
+ def _bytes_loader_lookup() -> dict[
+     type[IcebergType],
+     tuple[type[LoadFromBytesImpl], type[IcebergType] | Sequence[type[IcebergType]]],
+ ]:
+     from pyiceberg.types import (
+         BinaryType,
+         BooleanType,
+         DateType,
+         DecimalType,
+         FixedType,
+         IntegerType,
+         LongType,
+         StringType,
+         TimestampType,
+         TimestamptzType,
+         TimeType,
+     )
+
+     # TODO: Float statistics
+     return {
+         BooleanType: (LoadBooleanFromBytes, BooleanType),
+         DateType: (LoadDateFromBytes, DateType),
+         TimeType: (LoadTimeFromBytes, TimeType),
+         TimestampType: (LoadTimestampFromBytes, TimestampType),
+         TimestamptzType: (LoadTimestamptzFromBytes, TimestamptzType),
+         IntegerType: (LoadInt32FromBytes, IntegerType),
+         LongType: (LoadInt64FromBytes, (LongType, IntegerType)),
+         StringType: (LoadStringFromBytes, StringType),
+         BinaryType: (LoadBinaryFromBytes, BinaryType),
+         DecimalType: (LoadDecimalFromBytes, DecimalType),
+         FixedType: (LoadFixedFromBytes, FixedType),
+     }
+
+
+ class LoadFromBytesImpl(abc.ABC):
+     def __init__(self, polars_dtype: pl.DataType) -> None:
+         self.polars_dtype = polars_dtype
+
+     @staticmethod
+     def init_for_field_type(
+         current_field_type: IcebergType,
+         # All types that this field ID has been set to across schema changes.
+         all_field_types: set[IcebergType],
+         field_polars_dtype: pl.DataType,
+     ) -> LoadFromBytesImpl | None:
+         if (v := _bytes_loader_lookup().get(type(current_field_type))) is None:
+             return None
+
+         loader_impl, allowed_field_types = v
+
+         return (
+             loader_impl(field_polars_dtype)
+             if all(isinstance(x, allowed_field_types) for x in all_field_types)  # type: ignore[arg-type]
+             else None
+         )
+
+     @abc.abstractmethod
+     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
+         """`byte_values` should be of binary type."""
+
+
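A small sketch of the gating in init_for_field_type (assuming PyIceberg is installed; the loader classes are defined below):

    import polars as pl
    from pyiceberg.types import IntegerType, LongType, StringType

    # An int -> long promotion stays within the allowed types for the Int64 loader.
    loader = LoadFromBytesImpl.init_for_field_type(
        LongType(), {LongType(), IntegerType()}, pl.Int64()
    )
    assert type(loader) is LoadInt64FromBytes

    # An incompatible historical type disables statistics loading entirely.
    assert (
        LoadFromBytesImpl.init_for_field_type(
            LongType(), {LongType(), StringType()}, pl.Int64()
        )
        is None
    )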
+ class LoadBinaryFromBytes(LoadFromBytesImpl):
+     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
+         import polars as pl
+
+         return pl.Series(byte_values, dtype=pl.Binary)
+
+
+ class LoadDateFromBytes(LoadFromBytesImpl):
+     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
+         import polars as pl
+
+         return (
+             pl.Series(byte_values, dtype=pl.Binary)
+             .bin.reinterpret(dtype=pl.Int32, endianness="little")
+             .cast(pl.Date)
+         )
+
+
+ class LoadTimeFromBytes(LoadFromBytesImpl):
+     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
+         import polars as pl
+
+         return (
+             pl.Series(byte_values, dtype=pl.Binary).bin.reinterpret(
+                 dtype=pl.Int64, endianness="little"
+             )
+             * ICEBERG_TIME_TO_NS
+         ).cast(pl.Time)
+
+
+ class LoadTimestampFromBytes(LoadFromBytesImpl):
+     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
+         import polars as pl
+
+         return (
+             pl.Series(byte_values, dtype=pl.Binary)
+             .bin.reinterpret(dtype=pl.Int64, endianness="little")
+             .cast(pl.Datetime("us"))
+         )
+
+
+ class LoadTimestamptzFromBytes(LoadFromBytesImpl):
+     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
+         import polars as pl
+
+         return (
+             pl.Series(byte_values, dtype=pl.Binary)
+             .bin.reinterpret(dtype=pl.Int64, endianness="little")
+             .cast(pl.Datetime("us", time_zone="UTC"))
+         )
+
+
+ class LoadBooleanFromBytes(LoadFromBytesImpl):
+     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
+         import polars as pl
+
+         return (
+             pl.Series(byte_values, dtype=pl.Binary)
+             .bin.reinterpret(dtype=pl.UInt8, endianness="little")
+             .cast(pl.Boolean)
+         )
+
+
+ class LoadDecimalFromBytes(LoadFromBytesImpl):
+     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
+         import polars as pl
+         from polars._plr import PySeries
+
+         dtype = self.polars_dtype
+         assert isinstance(dtype, pl.Decimal)
+         assert dtype.precision is not None
+
+         return wrap_s(
+             PySeries._import_decimal_from_iceberg_binary_repr(
+                 bytes_list=byte_values,
+                 precision=dtype.precision,
+                 scale=dtype.scale,
+             )
+         )
+
+
+ class LoadFixedFromBytes(LoadBinaryFromBytes): ...
+
+
+ class LoadInt32FromBytes(LoadFromBytesImpl):
+     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
+         import polars as pl
+
+         return pl.Series(byte_values, dtype=pl.Binary).bin.reinterpret(
+             dtype=pl.Int32, endianness="little"
+         )
+
+
+ class LoadInt64FromBytes(LoadFromBytesImpl):
+     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
+         import polars as pl
+
+         s = pl.Series(byte_values, dtype=pl.Binary)
+
+         return s.bin.reinterpret(dtype=pl.Int64, endianness="little").fill_null(
+             s.bin.reinterpret(dtype=pl.Int32, endianness="little").cast(pl.Int64)
+         )
+
+
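The fill_null chain in LoadInt64FromBytes handles files written before an int to long schema promotion: Iceberg serializes each bound as a single little-endian value in the type the file was written with, so such files carry 4-byte bounds, which the 8-byte reinterpret turns into nulls. The two encodings, using only the standard library (illustrative):

    import struct

    struct.pack("<q", 7)  # b'\x07\x00\x00\x00\x00\x00\x00\x00' -> Int64 reinterpret
    struct.pack("<i", 7)  # b'\x07\x00\x00\x00' -> null as Int64, recovered via Int32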
+ class LoadStringFromBytes(LoadFromBytesImpl):
+     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
+         import polars as pl
+
+         return pl.Series(byte_values, dtype=pl.Binary).cast(pl.String)
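
Taken together, IcebergStatisticsLoader.finish yields one row per data file: a `len` column plus `<name>_nc` (null count), `<name>_min`, and `<name>_max` per filter column. For a single `id` column the resulting frame would look roughly like this (illustrative values only):

    import polars as pl

    pl.DataFrame(
        {
            "len": [1000, 500],     # record_count per data file
            "id_nc": [0, 3],        # null_value_counts
            "id_min": [1, 900],     # lower bounds / partition constants
            "id_max": [899, 1399],  # upper bounds / partition constants
        }
    )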