polars-runtime-compat: polars_runtime_compat-1.34.0b3-cp39-abi3-macosx_11_0_arm64.whl → polars_runtime_compat-1.34.0b5-cp39-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of polars-runtime-compat might be problematic.

Files changed (204)
  1. _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
  2. polars_runtime_compat-1.34.0b5.dist-info/METADATA +35 -0
  3. polars_runtime_compat-1.34.0b5.dist-info/RECORD +6 -0
  4. polars/__init__.py +0 -528
  5. polars/_cpu_check.py +0 -265
  6. polars/_dependencies.py +0 -355
  7. polars/_plr.py +0 -99
  8. polars/_plr.pyi +0 -2496
  9. polars/_reexport.py +0 -23
  10. polars/_typing.py +0 -478
  11. polars/_utils/__init__.py +0 -37
  12. polars/_utils/async_.py +0 -102
  13. polars/_utils/cache.py +0 -176
  14. polars/_utils/cloud.py +0 -40
  15. polars/_utils/constants.py +0 -29
  16. polars/_utils/construction/__init__.py +0 -46
  17. polars/_utils/construction/dataframe.py +0 -1397
  18. polars/_utils/construction/other.py +0 -72
  19. polars/_utils/construction/series.py +0 -560
  20. polars/_utils/construction/utils.py +0 -118
  21. polars/_utils/convert.py +0 -224
  22. polars/_utils/deprecation.py +0 -406
  23. polars/_utils/getitem.py +0 -457
  24. polars/_utils/logging.py +0 -11
  25. polars/_utils/nest_asyncio.py +0 -264
  26. polars/_utils/parquet.py +0 -15
  27. polars/_utils/parse/__init__.py +0 -12
  28. polars/_utils/parse/expr.py +0 -242
  29. polars/_utils/polars_version.py +0 -19
  30. polars/_utils/pycapsule.py +0 -53
  31. polars/_utils/scan.py +0 -27
  32. polars/_utils/serde.py +0 -63
  33. polars/_utils/slice.py +0 -215
  34. polars/_utils/udfs.py +0 -1251
  35. polars/_utils/unstable.py +0 -63
  36. polars/_utils/various.py +0 -782
  37. polars/_utils/wrap.py +0 -25
  38. polars/api.py +0 -370
  39. polars/catalog/__init__.py +0 -0
  40. polars/catalog/unity/__init__.py +0 -19
  41. polars/catalog/unity/client.py +0 -733
  42. polars/catalog/unity/models.py +0 -152
  43. polars/config.py +0 -1571
  44. polars/convert/__init__.py +0 -25
  45. polars/convert/general.py +0 -1046
  46. polars/convert/normalize.py +0 -261
  47. polars/dataframe/__init__.py +0 -5
  48. polars/dataframe/_html.py +0 -186
  49. polars/dataframe/frame.py +0 -12582
  50. polars/dataframe/group_by.py +0 -1067
  51. polars/dataframe/plotting.py +0 -257
  52. polars/datatype_expr/__init__.py +0 -5
  53. polars/datatype_expr/array.py +0 -56
  54. polars/datatype_expr/datatype_expr.py +0 -304
  55. polars/datatype_expr/list.py +0 -18
  56. polars/datatype_expr/struct.py +0 -69
  57. polars/datatypes/__init__.py +0 -122
  58. polars/datatypes/_parse.py +0 -195
  59. polars/datatypes/_utils.py +0 -48
  60. polars/datatypes/classes.py +0 -1213
  61. polars/datatypes/constants.py +0 -11
  62. polars/datatypes/constructor.py +0 -172
  63. polars/datatypes/convert.py +0 -366
  64. polars/datatypes/group.py +0 -130
  65. polars/exceptions.py +0 -230
  66. polars/expr/__init__.py +0 -7
  67. polars/expr/array.py +0 -964
  68. polars/expr/binary.py +0 -346
  69. polars/expr/categorical.py +0 -306
  70. polars/expr/datetime.py +0 -2620
  71. polars/expr/expr.py +0 -11272
  72. polars/expr/list.py +0 -1408
  73. polars/expr/meta.py +0 -444
  74. polars/expr/name.py +0 -321
  75. polars/expr/string.py +0 -3045
  76. polars/expr/struct.py +0 -357
  77. polars/expr/whenthen.py +0 -185
  78. polars/functions/__init__.py +0 -193
  79. polars/functions/aggregation/__init__.py +0 -33
  80. polars/functions/aggregation/horizontal.py +0 -298
  81. polars/functions/aggregation/vertical.py +0 -341
  82. polars/functions/as_datatype.py +0 -848
  83. polars/functions/business.py +0 -138
  84. polars/functions/col.py +0 -384
  85. polars/functions/datatype.py +0 -121
  86. polars/functions/eager.py +0 -524
  87. polars/functions/escape_regex.py +0 -29
  88. polars/functions/lazy.py +0 -2751
  89. polars/functions/len.py +0 -68
  90. polars/functions/lit.py +0 -210
  91. polars/functions/random.py +0 -22
  92. polars/functions/range/__init__.py +0 -19
  93. polars/functions/range/_utils.py +0 -15
  94. polars/functions/range/date_range.py +0 -303
  95. polars/functions/range/datetime_range.py +0 -370
  96. polars/functions/range/int_range.py +0 -348
  97. polars/functions/range/linear_space.py +0 -311
  98. polars/functions/range/time_range.py +0 -287
  99. polars/functions/repeat.py +0 -301
  100. polars/functions/whenthen.py +0 -353
  101. polars/interchange/__init__.py +0 -10
  102. polars/interchange/buffer.py +0 -77
  103. polars/interchange/column.py +0 -190
  104. polars/interchange/dataframe.py +0 -230
  105. polars/interchange/from_dataframe.py +0 -328
  106. polars/interchange/protocol.py +0 -303
  107. polars/interchange/utils.py +0 -170
  108. polars/io/__init__.py +0 -64
  109. polars/io/_utils.py +0 -317
  110. polars/io/avro.py +0 -49
  111. polars/io/clipboard.py +0 -36
  112. polars/io/cloud/__init__.py +0 -17
  113. polars/io/cloud/_utils.py +0 -80
  114. polars/io/cloud/credential_provider/__init__.py +0 -17
  115. polars/io/cloud/credential_provider/_builder.py +0 -520
  116. polars/io/cloud/credential_provider/_providers.py +0 -618
  117. polars/io/csv/__init__.py +0 -9
  118. polars/io/csv/_utils.py +0 -38
  119. polars/io/csv/batched_reader.py +0 -142
  120. polars/io/csv/functions.py +0 -1495
  121. polars/io/database/__init__.py +0 -6
  122. polars/io/database/_arrow_registry.py +0 -70
  123. polars/io/database/_cursor_proxies.py +0 -147
  124. polars/io/database/_executor.py +0 -578
  125. polars/io/database/_inference.py +0 -314
  126. polars/io/database/_utils.py +0 -144
  127. polars/io/database/functions.py +0 -516
  128. polars/io/delta.py +0 -499
  129. polars/io/iceberg/__init__.py +0 -3
  130. polars/io/iceberg/_utils.py +0 -697
  131. polars/io/iceberg/dataset.py +0 -556
  132. polars/io/iceberg/functions.py +0 -151
  133. polars/io/ipc/__init__.py +0 -8
  134. polars/io/ipc/functions.py +0 -514
  135. polars/io/json/__init__.py +0 -3
  136. polars/io/json/read.py +0 -101
  137. polars/io/ndjson.py +0 -332
  138. polars/io/parquet/__init__.py +0 -17
  139. polars/io/parquet/field_overwrites.py +0 -140
  140. polars/io/parquet/functions.py +0 -722
  141. polars/io/partition.py +0 -491
  142. polars/io/plugins.py +0 -187
  143. polars/io/pyarrow_dataset/__init__.py +0 -5
  144. polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
  145. polars/io/pyarrow_dataset/functions.py +0 -79
  146. polars/io/scan_options/__init__.py +0 -5
  147. polars/io/scan_options/_options.py +0 -59
  148. polars/io/scan_options/cast_options.py +0 -126
  149. polars/io/spreadsheet/__init__.py +0 -6
  150. polars/io/spreadsheet/_utils.py +0 -52
  151. polars/io/spreadsheet/_write_utils.py +0 -647
  152. polars/io/spreadsheet/functions.py +0 -1323
  153. polars/lazyframe/__init__.py +0 -9
  154. polars/lazyframe/engine_config.py +0 -61
  155. polars/lazyframe/frame.py +0 -8564
  156. polars/lazyframe/group_by.py +0 -669
  157. polars/lazyframe/in_process.py +0 -42
  158. polars/lazyframe/opt_flags.py +0 -333
  159. polars/meta/__init__.py +0 -14
  160. polars/meta/build.py +0 -33
  161. polars/meta/index_type.py +0 -27
  162. polars/meta/thread_pool.py +0 -50
  163. polars/meta/versions.py +0 -120
  164. polars/ml/__init__.py +0 -0
  165. polars/ml/torch.py +0 -213
  166. polars/ml/utilities.py +0 -30
  167. polars/plugins.py +0 -155
  168. polars/py.typed +0 -0
  169. polars/pyproject.toml +0 -103
  170. polars/schema.py +0 -265
  171. polars/selectors.py +0 -3117
  172. polars/series/__init__.py +0 -5
  173. polars/series/array.py +0 -776
  174. polars/series/binary.py +0 -254
  175. polars/series/categorical.py +0 -246
  176. polars/series/datetime.py +0 -2275
  177. polars/series/list.py +0 -1087
  178. polars/series/plotting.py +0 -191
  179. polars/series/series.py +0 -9197
  180. polars/series/string.py +0 -2367
  181. polars/series/struct.py +0 -154
  182. polars/series/utils.py +0 -191
  183. polars/sql/__init__.py +0 -7
  184. polars/sql/context.py +0 -677
  185. polars/sql/functions.py +0 -139
  186. polars/string_cache.py +0 -185
  187. polars/testing/__init__.py +0 -13
  188. polars/testing/asserts/__init__.py +0 -9
  189. polars/testing/asserts/frame.py +0 -231
  190. polars/testing/asserts/series.py +0 -219
  191. polars/testing/asserts/utils.py +0 -12
  192. polars/testing/parametric/__init__.py +0 -33
  193. polars/testing/parametric/profiles.py +0 -107
  194. polars/testing/parametric/strategies/__init__.py +0 -22
  195. polars/testing/parametric/strategies/_utils.py +0 -14
  196. polars/testing/parametric/strategies/core.py +0 -615
  197. polars/testing/parametric/strategies/data.py +0 -452
  198. polars/testing/parametric/strategies/dtype.py +0 -436
  199. polars/testing/parametric/strategies/legacy.py +0 -169
  200. polars/type_aliases.py +0 -24
  201. polars_runtime_compat-1.34.0b3.dist-info/METADATA +0 -190
  202. polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
  203. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/WHEEL +0 -0
  204. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/licenses/LICENSE +0 -0
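Most of the diff below is the removal of polars/io/iceberg/_utils.py, which converted stringified PyArrow predicates into PyIceberg expressions by parsing them with ast.parse and walking the tree via singledispatch. As a minimal sketch of that path (assuming the 1.34.0b3 layout, where the module still exists, plus an installed pyiceberg; the field()/scalar() calls mirror the stringified-PyArrow syntax the module expects):

    # Sketch only -- exercises the removed helpers as they existed in 1.34.0b3.
    from polars.io.iceberg._utils import _convert_predicate, _to_ast

    expr_ast = _to_ast("(field('a') > scalar(1)) & field('b').is_null()")
    print(_convert_predicate(expr_ast))
    # a pyiceberg And(GreaterThan('a', 1), IsNull('b')) expression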
polars/io/iceberg/_utils.py (deleted)
@@ -1,697 +0,0 @@
- from __future__ import annotations
-
- import abc
- import ast
- import contextlib
- from _ast import GtE, Lt, LtE
- from ast import (
-     Attribute,
-     BinOp,
-     BitAnd,
-     BitOr,
-     Call,
-     Compare,
-     Constant,
-     Eq,
-     Gt,
-     Invert,
-     List,
-     Name,
-     UnaryOp,
- )
- from dataclasses import dataclass
- from functools import cache, singledispatch
- from typing import TYPE_CHECKING, Any, Callable
-
- import polars._reexport as pl
- from polars._utils.convert import to_py_date, to_py_datetime
- from polars._utils.logging import eprint
- from polars._utils.wrap import wrap_s
- from polars.exceptions import ComputeError
-
- if TYPE_CHECKING:
-     from collections.abc import Sequence
-     from datetime import date, datetime
-
-     import pyiceberg
-     import pyiceberg.schema
-     from pyiceberg.manifest import DataFile
-     from pyiceberg.table import Table
-     from pyiceberg.types import IcebergType
-
-     from polars import DataFrame, Series
- else:
-     from polars._dependencies import pyiceberg
-
- _temporal_conversions: dict[str, Callable[..., datetime | date]] = {
-     "to_py_date": to_py_date,
-     "to_py_datetime": to_py_datetime,
- }
-
- ICEBERG_TIME_TO_NS: int = 1000
-
-
- def _scan_pyarrow_dataset_impl(
-     tbl: Table,
-     with_columns: list[str] | None = None,
-     predicate: str | None = None,
-     n_rows: int | None = None,
-     snapshot_id: int | None = None,
-     **kwargs: Any,
- ) -> DataFrame | Series:
-     """
-     Take the projected columns and materialize an arrow table.
-
-     Parameters
-     ----------
-     tbl
-         pyarrow dataset
-     with_columns
-         Columns that are projected
-     predicate
-         pyarrow expression that can be evaluated with eval
-     n_rows:
-         Materialize only n rows from the arrow dataset.
-     snapshot_id:
-         The snapshot ID to scan from.
-     batch_size
-         The maximum row count for scanned pyarrow record batches.
-     kwargs:
-         For backward compatibility
-
-     Returns
-     -------
-     DataFrame
-     """
-     from polars import from_arrow
-
-     scan = tbl.scan(limit=n_rows, snapshot_id=snapshot_id)
-
-     if with_columns is not None:
-         scan = scan.select(*with_columns)
-
-     if predicate is not None:
-         try:
-             expr_ast = _to_ast(predicate)
-             pyiceberg_expr = _convert_predicate(expr_ast)
-         except ValueError as e:
-             msg = f"Could not convert predicate to PyIceberg: {predicate}"
-             raise ValueError(msg) from e
-
-         scan = scan.filter(pyiceberg_expr)
-
-     return from_arrow(scan.to_arrow())
-
-
- def _to_ast(expr: str) -> ast.expr:
-     """
-     Converts a Python string to an AST.
-
-     This will take the Python Arrow expression (as a string), and it will
-     be converted into a Python AST that can be traversed to convert it to a PyIceberg
-     expression.
-
-     The reason to convert it to an AST is because the PyArrow expression
-     itself doesn't have any methods/properties to traverse the expression.
-     We need this to convert it into a PyIceberg expression.
-
-     Parameters
-     ----------
-     expr
-         The string expression
-
-     Returns
-     -------
-     The AST representing the Arrow expression
-     """
-     return ast.parse(expr, mode="eval").body
-
-
- @singledispatch
- def _convert_predicate(a: Any) -> Any:
-     """Walks the AST to convert the PyArrow expression to a PyIceberg expression."""
-     msg = f"Unexpected symbol: {a}"
-     raise ValueError(msg)
-
-
- @_convert_predicate.register(Constant)
- def _(a: Constant) -> Any:
-     return a.value
-
-
- @_convert_predicate.register(Name)
- def _(a: Name) -> Any:
-     return a.id
-
-
- @_convert_predicate.register(UnaryOp)
- def _(a: UnaryOp) -> Any:
-     if isinstance(a.op, Invert):
-         return pyiceberg.expressions.Not(_convert_predicate(a.operand))
-     else:
-         msg = f"Unexpected UnaryOp: {a}"
-         raise TypeError(msg)
-
-
- @_convert_predicate.register(Call)
- def _(a: Call) -> Any:
-     args = [_convert_predicate(arg) for arg in a.args]
-     f = _convert_predicate(a.func)
-     if f == "field":
-         return args
-     elif f == "scalar":
-         return args[0]
-     elif f in _temporal_conversions:
-         # convert from polars-native i64 to ISO8601 string
-         return _temporal_conversions[f](*args).isoformat()
-     else:
-         ref = _convert_predicate(a.func.value)[0]  # type: ignore[attr-defined]
-         if f == "isin":
-             return pyiceberg.expressions.In(ref, args[0])
-         elif f == "is_null":
-             return pyiceberg.expressions.IsNull(ref)
-         elif f == "is_nan":
-             return pyiceberg.expressions.IsNaN(ref)
-
-     msg = f"Unknown call: {f!r}"
-     raise ValueError(msg)
-
-
- @_convert_predicate.register(Attribute)
- def _(a: Attribute) -> Any:
-     return a.attr
-
-
- @_convert_predicate.register(BinOp)
- def _(a: BinOp) -> Any:
-     lhs = _convert_predicate(a.left)
-     rhs = _convert_predicate(a.right)
-
-     op = a.op
-     if isinstance(op, BitAnd):
-         return pyiceberg.expressions.And(lhs, rhs)
-     if isinstance(op, BitOr):
-         return pyiceberg.expressions.Or(lhs, rhs)
-     else:
-         msg = f"Unknown: {lhs} {op} {rhs}"
-         raise TypeError(msg)
-
-
- @_convert_predicate.register(Compare)
- def _(a: Compare) -> Any:
-     op = a.ops[0]
-     lhs = _convert_predicate(a.left)[0]
-     rhs = _convert_predicate(a.comparators[0])
-
-     if isinstance(op, Gt):
-         return pyiceberg.expressions.GreaterThan(lhs, rhs)
-     if isinstance(op, GtE):
-         return pyiceberg.expressions.GreaterThanOrEqual(lhs, rhs)
-     if isinstance(op, Eq):
-         return pyiceberg.expressions.EqualTo(lhs, rhs)
-     if isinstance(op, Lt):
-         return pyiceberg.expressions.LessThan(lhs, rhs)
-     if isinstance(op, LtE):
-         return pyiceberg.expressions.LessThanOrEqual(lhs, rhs)
-     else:
-         msg = f"Unknown comparison: {op}"
-         raise TypeError(msg)
-
-
- @_convert_predicate.register(List)
- def _(a: List) -> Any:
-     return [_convert_predicate(e) for e in a.elts]
-
-
- class IdentityTransformedPartitionValuesBuilder:
-     def __init__(
-         self,
-         table: Table,
-         projected_schema: pyiceberg.schema.Schema,
-     ) -> None:
-         import pyiceberg.schema
-         from pyiceberg.io.pyarrow import schema_to_pyarrow
-         from pyiceberg.transforms import IdentityTransform
-         from pyiceberg.types import (
-             DoubleType,
-             FloatType,
-             IntegerType,
-             LongType,
-         )
-
-         projected_ids: set[int] = projected_schema.field_ids
-
-         # {source_field_id: [values] | error_message}
-         self.partition_values: dict[int, list[Any] | str] = {}
-         # Logical types will have length-2 list [<constructor type>, <cast type>].
-         # E.g. for Datetime it will be [Int64, Datetime]
-         self.partition_values_dtypes: dict[int, pl.DataType] = {}
-
-         # {spec_id: [partition_value_index, source_field_id]}
-         self.partition_spec_id_to_identity_transforms: dict[
-             int, list[tuple[int, int]]
-         ] = {}
-
-         partition_specs = table.specs()
-
-         for spec_id, spec in partition_specs.items():
-             out = []
-
-             for field_index, field in enumerate(spec.fields):
-                 if field.source_id in projected_ids and isinstance(
-                     field.transform, IdentityTransform
-                 ):
-                     out.append((field_index, field.source_id))
-                     self.partition_values[field.source_id] = []
-
-             self.partition_spec_id_to_identity_transforms[spec_id] = out
-
-         for field_id in self.partition_values:
-             projected_field = projected_schema.find_field(field_id)
-             projected_type = projected_field.field_type
-
-             _, output_dtype = pl.Schema(
-                 schema_to_pyarrow(pyiceberg.schema.Schema(projected_field))
-             ).popitem()
-
-             self.partition_values_dtypes[field_id] = output_dtype
-
-             if not projected_type.is_primitive or output_dtype.is_nested():
-                 self.partition_values[field_id] = (
-                     f"non-primitive type: {projected_type = } {output_dtype = }"
-                 )
-
-             for schema in table.schemas().values():
-                 try:
-                     type_this_schema = schema.find_field(field_id).field_type
-                 except ValueError:
-                     continue
-
-                 if not (
-                     projected_type == type_this_schema
-                     or (
-                         isinstance(projected_type, LongType)
-                         and isinstance(type_this_schema, IntegerType)
-                     )
-                     or (
-                         isinstance(projected_type, (DoubleType, FloatType))
-                         and isinstance(type_this_schema, (DoubleType, FloatType))
-                     )
-                 ):
-                     self.partition_values[field_id] = (
-                         f"unsupported type change: from: {type_this_schema}, "
-                         f"to: {projected_type}"
-                     )
-
-     def push_partition_values(
-         self,
-         *,
-         current_index: int,
-         partition_spec_id: int,
-         partition_values: pyiceberg.typedef.Record,
-     ) -> None:
-         try:
-             identity_transforms = self.partition_spec_id_to_identity_transforms[
-                 partition_spec_id
-             ]
-         except KeyError:
-             self.partition_values = {
-                 k: f"partition spec ID not found: {partition_spec_id}"
-                 for k in self.partition_values
-             }
-             return
-
-         for i, source_field_id in identity_transforms:
-             partition_value = partition_values[i]
-
-             if isinstance(values := self.partition_values[source_field_id], list):
-                 # extend() - there can be gaps from partitions being
-                 # added/removed/re-added
-                 values.extend(None for _ in range(current_index - len(values)))
-                 values.append(partition_value)
-
-     def finish(self) -> dict[int, pl.Series | str]:
-         from polars.datatypes import Date, Datetime, Duration, Int32, Int64, Time
-
-         out: dict[int, pl.Series | str] = {}
-
-         for field_id, v in self.partition_values.items():
-             if isinstance(v, str):
-                 out[field_id] = v
-             else:
-                 try:
-                     output_dtype = self.partition_values_dtypes[field_id]
-
-                     constructor_dtype = (
-                         Int64
-                         if isinstance(output_dtype, (Datetime, Duration, Time))
-                         else Int32
-                         if isinstance(output_dtype, Date)
-                         else output_dtype
-                     )
-
-                     s = pl.Series(v, dtype=constructor_dtype)
-
-                     assert not s.dtype.is_nested()
-
-                     if isinstance(output_dtype, Time):
-                         # Physical from PyIceberg is in microseconds, physical
-                         # used by polars is in nanoseconds.
-                         s = s * ICEBERG_TIME_TO_NS
-
-                     s = s.cast(output_dtype)
-
-                     out[field_id] = s
-
-                 except Exception as e:
-                     out[field_id] = f"failed to load partition values: {e}"
-
-         return out
-
-
- class IcebergStatisticsLoader:
-     def __init__(
-         self,
-         table: Table,
-         projected_filter_schema: pyiceberg.schema.Schema,
-     ) -> None:
-         import pyiceberg.schema
-         from pyiceberg.io.pyarrow import schema_to_pyarrow
-
-         import polars as pl
-         import polars._utils.logging
-
-         verbose = polars._utils.logging.verbose()
-
-         self.file_column_statistics: dict[int, IcebergColumnStatisticsLoader] = {}
-         self.load_as_empty_statistics: list[str] = []
-         self.file_lengths: list[int] = []
-         self.projected_filter_schema = projected_filter_schema
-
-         for field in projected_filter_schema.fields:
-             field_all_types = set()
-
-             for schema in table.schemas().values():
-                 with contextlib.suppress(ValueError):
-                     field_all_types.add(schema.find_field(field.field_id).field_type)
-
-             _, field_polars_dtype = pl.Schema(
-                 schema_to_pyarrow(pyiceberg.schema.Schema(field))
-             ).popitem()
-
-             load_from_bytes_impl = LoadFromBytesImpl.init_for_field_type(
-                 field.field_type,
-                 field_all_types,
-                 field_polars_dtype,
-             )
-
-             if verbose:
-                 _load_from_bytes_impl = (
-                     type(load_from_bytes_impl).__name__
-                     if load_from_bytes_impl is not None
-                     else "None"
-                 )
-
-                 eprint(
-                     "IcebergStatisticsLoader: "
-                     f"{field.name = }, "
-                     f"{field.field_id = }, "
-                     f"{field.field_type = }, "
-                     f"{field_all_types = }, "
-                     f"{field_polars_dtype = }, "
-                     f"{_load_from_bytes_impl = }"
-                 )
-
-             self.file_column_statistics[field.field_id] = IcebergColumnStatisticsLoader(
-                 field_id=field.field_id,
-                 column_name=field.name,
-                 column_dtype=field_polars_dtype,
-                 load_from_bytes_impl=load_from_bytes_impl,
-                 min_values=[],
-                 max_values=[],
-                 null_count=[],
-             )
-
-     def push_file_statistics(self, file: DataFile) -> None:
-         self.file_lengths.append(file.record_count)
-
-         for stats in self.file_column_statistics.values():
-             stats.push_file_statistics(file)
-
-     def finish(
-         self,
-         expected_height: int,
-         identity_transformed_values: dict[int, pl.Series | str],
-     ) -> pl.DataFrame:
-         import polars as pl
-
-         out: list[pl.DataFrame] = [
-             pl.Series("len", self.file_lengths, dtype=pl.UInt32).to_frame()
-         ]
-
-         for field_id, stat_builder in self.file_column_statistics.items():
-             if (p := identity_transformed_values.get(field_id)) is not None:
-                 if isinstance(p, str):
-                     msg = f"statistics load failure for filter column: {p}"
-                     raise ComputeError(msg)
-
-             column_stats_df = stat_builder.finish(expected_height, p)
-             out.append(column_stats_df)
-
-         return pl.concat(out, how="horizontal")
-
-
- @dataclass
- class IcebergColumnStatisticsLoader:
-     column_name: str
-     column_dtype: pl.DataType
-     field_id: int
-     load_from_bytes_impl: LoadFromBytesImpl | None
-     null_count: list[int | None]
-     min_values: list[bytes | None]
-     max_values: list[bytes | None]
-
-     def push_file_statistics(self, file: DataFile) -> None:
-         self.null_count.append(file.null_value_counts.get(self.field_id))
-
-         if self.load_from_bytes_impl is not None:
-             self.min_values.append(file.lower_bounds.get(self.field_id))
-             self.max_values.append(file.upper_bounds.get(self.field_id))
-
-     def finish(
-         self,
-         expected_height: int,
-         identity_transformed_values: pl.Series | None,
-     ) -> pl.DataFrame:
-         import polars as pl
-
-         c = self.column_name
-         assert len(self.null_count) == expected_height
-
-         out = pl.Series(f"{c}_nc", self.null_count, dtype=pl.UInt32).to_frame()
-
-         if self.load_from_bytes_impl is None:
-             s = (
-                 identity_transformed_values
-                 if identity_transformed_values is not None
-                 else pl.repeat(None, expected_height, dtype=self.column_dtype)
-             )
-
-             return out.with_columns(s.alias(f"{c}_min"), s.alias(f"{c}_max"))
-
-         assert len(self.min_values) == expected_height
-         assert len(self.max_values) == expected_height
-
-         if self.column_dtype.is_nested():
-             raise NotImplementedError
-
-         min_values = self.load_from_bytes_impl.load_from_bytes(self.min_values)
-         max_values = self.load_from_bytes_impl.load_from_bytes(self.max_values)
-
-         if identity_transformed_values is not None:
-             assert identity_transformed_values.dtype == self.column_dtype
-
-             identity_transformed_values = identity_transformed_values.extend_constant(
-                 None, expected_height - identity_transformed_values.len()
-             )
-
-             min_values = identity_transformed_values.fill_null(min_values)
-             max_values = identity_transformed_values.fill_null(max_values)
-
-         return out.with_columns(
-             min_values.alias(f"{c}_min"), max_values.alias(f"{c}_max")
-         )
-
-
- # Lazy init instead of global const as PyIceberg is an optional dependency
- @cache
- def _bytes_loader_lookup() -> dict[
-     type[IcebergType],
-     tuple[type[LoadFromBytesImpl], type[IcebergType] | Sequence[type[IcebergType]]],
- ]:
-     from pyiceberg.types import (
-         BinaryType,
-         BooleanType,
-         DateType,
-         DecimalType,
-         FixedType,
-         IntegerType,
-         LongType,
-         StringType,
-         TimestampType,
-         TimestamptzType,
-         TimeType,
-     )
-
-     # TODO: Float statistics
-     return {
-         BooleanType: (LoadBooleanFromBytes, BooleanType),
-         DateType: (LoadDateFromBytes, DateType),
-         TimeType: (LoadTimeFromBytes, TimeType),
-         TimestampType: (LoadTimestampFromBytes, TimestampType),
-         TimestamptzType: (LoadTimestamptzFromBytes, TimestamptzType),
-         IntegerType: (LoadInt32FromBytes, IntegerType),
-         LongType: (LoadInt64FromBytes, (LongType, IntegerType)),
-         StringType: (LoadStringFromBytes, StringType),
-         BinaryType: (LoadBinaryFromBytes, BinaryType),
-         DecimalType: (LoadDecimalFromBytes, DecimalType),
-         FixedType: (LoadFixedFromBytes, FixedType),
-     }
-
-
- class LoadFromBytesImpl(abc.ABC):
-     def __init__(self, polars_dtype: pl.DataType) -> None:
-         self.polars_dtype = polars_dtype
-
-     @staticmethod
-     def init_for_field_type(
-         current_field_type: IcebergType,
-         # All types that this field ID has been set to across schema changes.
-         all_field_types: set[IcebergType],
-         field_polars_dtype: pl.DataType,
-     ) -> LoadFromBytesImpl | None:
-         if (v := _bytes_loader_lookup().get(type(current_field_type))) is None:
-             return None
-
-         loader_impl, allowed_field_types = v
-
-         return (
-             loader_impl(field_polars_dtype)
-             if all(isinstance(x, allowed_field_types) for x in all_field_types)  # type: ignore[arg-type]
-             else None
-         )
-
-     @abc.abstractmethod
-     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-         """`bytes_values` should be of binary type."""
-
-
- class LoadBinaryFromBytes(LoadFromBytesImpl):
-     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-         import polars as pl
-
-         return pl.Series(byte_values, dtype=pl.Binary)
-
-
- class LoadDateFromBytes(LoadFromBytesImpl):
-     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-         import polars as pl
-
-         return (
-             pl.Series(byte_values, dtype=pl.Binary)
-             .bin.reinterpret(dtype=pl.Int32, endianness="little")
-             .cast(pl.Date)
-         )
-
-
- class LoadTimeFromBytes(LoadFromBytesImpl):
-     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-         import polars as pl
-
-         return (
-             pl.Series(byte_values, dtype=pl.Binary).bin.reinterpret(
-                 dtype=pl.Int64, endianness="little"
-             )
-             * ICEBERG_TIME_TO_NS
-         ).cast(pl.Time)
-
-
- class LoadTimestampFromBytes(LoadFromBytesImpl):
-     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-         import polars as pl
-
-         return (
-             pl.Series(byte_values, dtype=pl.Binary)
-             .bin.reinterpret(dtype=pl.Int64, endianness="little")
-             .cast(pl.Datetime("us"))
-         )
-
-
- class LoadTimestamptzFromBytes(LoadFromBytesImpl):
-     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-         import polars as pl
-
-         return (
-             pl.Series(byte_values, dtype=pl.Binary)
-             .bin.reinterpret(dtype=pl.Int64, endianness="little")
-             .cast(pl.Datetime("us", time_zone="UTC"))
-         )
-
-
- class LoadBooleanFromBytes(LoadFromBytesImpl):
-     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-         import polars as pl
-
-         return (
-             pl.Series(byte_values, dtype=pl.Binary)
-             .bin.reinterpret(dtype=pl.UInt8, endianness="little")
-             .cast(pl.Boolean)
-         )
-
-
- class LoadDecimalFromBytes(LoadFromBytesImpl):
-     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-         import polars as pl
-         from polars._plr import PySeries
-
-         dtype = self.polars_dtype
-         assert isinstance(dtype, pl.Decimal)
-         assert dtype.precision is not None
-
-         return wrap_s(
-             PySeries._import_decimal_from_iceberg_binary_repr(
-                 bytes_list=byte_values,
-                 precision=dtype.precision,
-                 scale=dtype.scale,
-             )
-         )
-
-
- class LoadFixedFromBytes(LoadBinaryFromBytes): ...
-
-
- class LoadInt32FromBytes(LoadFromBytesImpl):
-     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-         import polars as pl
-
-         return pl.Series(byte_values, dtype=pl.Binary).bin.reinterpret(
-             dtype=pl.Int32, endianness="little"
-         )
-
-
- class LoadInt64FromBytes(LoadFromBytesImpl):
-     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-         import polars as pl
-
-         s = pl.Series(byte_values, dtype=pl.Binary)
-
-         return s.bin.reinterpret(dtype=pl.Int64, endianness="little").fill_null(
-             s.bin.reinterpret(dtype=pl.Int32, endianness="little").cast(pl.Int64)
-         )
-
-
- class LoadStringFromBytes(LoadFromBytesImpl):
-     def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-         import polars as pl
-
-         return pl.Series(byte_values, dtype=pl.Binary).cast(pl.String)
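
The Load*FromBytes implementations above all share one idiom: Iceberg manifests store per-file lower/upper bounds as raw little-endian bytes, so each loader builds a Binary Series, reinterprets it as the physical integer type, then casts to the logical dtype. A self-contained sketch of that idiom (example values invented here, not taken from the package):

    import polars as pl

    # 42 encoded as 4 little-endian bytes, plus a file with no recorded bound.
    raw = pl.Series([b"\x2a\x00\x00\x00", None], dtype=pl.Binary)

    ints = raw.bin.reinterpret(dtype=pl.Int32, endianness="little")
    print(ints.to_list())  # [42, None]

    # Logical types add a cast, e.g. Date is stored as Int32 days since epoch.
    print(ints.cast(pl.Date).to_list())  # [datetime.date(1970, 2, 12), None]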