polars-runtime-compat 1.34.0b2-cp39-abi3-win_amd64.whl → 1.34.0b4-cp39-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of polars-runtime-compat might be problematic.
- _polars_runtime_compat/_polars_runtime_compat.pyd +0 -0
- {polars_runtime_compat-1.34.0b2.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/METADATA +1 -1
- polars_runtime_compat-1.34.0b4.dist-info/RECORD +6 -0
- polars/__init__.py +0 -528
- polars/_cpu_check.py +0 -265
- polars/_dependencies.py +0 -355
- polars/_plr.py +0 -99
- polars/_plr.pyi +0 -2496
- polars/_reexport.py +0 -23
- polars/_typing.py +0 -478
- polars/_utils/__init__.py +0 -37
- polars/_utils/async_.py +0 -102
- polars/_utils/cache.py +0 -176
- polars/_utils/cloud.py +0 -40
- polars/_utils/constants.py +0 -29
- polars/_utils/construction/__init__.py +0 -46
- polars/_utils/construction/dataframe.py +0 -1397
- polars/_utils/construction/other.py +0 -72
- polars/_utils/construction/series.py +0 -560
- polars/_utils/construction/utils.py +0 -118
- polars/_utils/convert.py +0 -224
- polars/_utils/deprecation.py +0 -406
- polars/_utils/getitem.py +0 -457
- polars/_utils/logging.py +0 -11
- polars/_utils/nest_asyncio.py +0 -264
- polars/_utils/parquet.py +0 -15
- polars/_utils/parse/__init__.py +0 -12
- polars/_utils/parse/expr.py +0 -242
- polars/_utils/polars_version.py +0 -19
- polars/_utils/pycapsule.py +0 -53
- polars/_utils/scan.py +0 -27
- polars/_utils/serde.py +0 -63
- polars/_utils/slice.py +0 -215
- polars/_utils/udfs.py +0 -1251
- polars/_utils/unstable.py +0 -63
- polars/_utils/various.py +0 -782
- polars/_utils/wrap.py +0 -25
- polars/api.py +0 -370
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +0 -19
- polars/catalog/unity/client.py +0 -733
- polars/catalog/unity/models.py +0 -152
- polars/config.py +0 -1571
- polars/convert/__init__.py +0 -25
- polars/convert/general.py +0 -1046
- polars/convert/normalize.py +0 -261
- polars/dataframe/__init__.py +0 -5
- polars/dataframe/_html.py +0 -186
- polars/dataframe/frame.py +0 -12582
- polars/dataframe/group_by.py +0 -1067
- polars/dataframe/plotting.py +0 -257
- polars/datatype_expr/__init__.py +0 -5
- polars/datatype_expr/array.py +0 -56
- polars/datatype_expr/datatype_expr.py +0 -304
- polars/datatype_expr/list.py +0 -18
- polars/datatype_expr/struct.py +0 -69
- polars/datatypes/__init__.py +0 -122
- polars/datatypes/_parse.py +0 -195
- polars/datatypes/_utils.py +0 -48
- polars/datatypes/classes.py +0 -1213
- polars/datatypes/constants.py +0 -11
- polars/datatypes/constructor.py +0 -172
- polars/datatypes/convert.py +0 -366
- polars/datatypes/group.py +0 -130
- polars/exceptions.py +0 -230
- polars/expr/__init__.py +0 -7
- polars/expr/array.py +0 -964
- polars/expr/binary.py +0 -346
- polars/expr/categorical.py +0 -306
- polars/expr/datetime.py +0 -2620
- polars/expr/expr.py +0 -11272
- polars/expr/list.py +0 -1408
- polars/expr/meta.py +0 -444
- polars/expr/name.py +0 -321
- polars/expr/string.py +0 -3045
- polars/expr/struct.py +0 -357
- polars/expr/whenthen.py +0 -185
- polars/functions/__init__.py +0 -193
- polars/functions/aggregation/__init__.py +0 -33
- polars/functions/aggregation/horizontal.py +0 -298
- polars/functions/aggregation/vertical.py +0 -341
- polars/functions/as_datatype.py +0 -848
- polars/functions/business.py +0 -138
- polars/functions/col.py +0 -384
- polars/functions/datatype.py +0 -121
- polars/functions/eager.py +0 -524
- polars/functions/escape_regex.py +0 -29
- polars/functions/lazy.py +0 -2751
- polars/functions/len.py +0 -68
- polars/functions/lit.py +0 -210
- polars/functions/random.py +0 -22
- polars/functions/range/__init__.py +0 -19
- polars/functions/range/_utils.py +0 -15
- polars/functions/range/date_range.py +0 -303
- polars/functions/range/datetime_range.py +0 -370
- polars/functions/range/int_range.py +0 -348
- polars/functions/range/linear_space.py +0 -311
- polars/functions/range/time_range.py +0 -287
- polars/functions/repeat.py +0 -301
- polars/functions/whenthen.py +0 -353
- polars/interchange/__init__.py +0 -10
- polars/interchange/buffer.py +0 -77
- polars/interchange/column.py +0 -190
- polars/interchange/dataframe.py +0 -230
- polars/interchange/from_dataframe.py +0 -328
- polars/interchange/protocol.py +0 -303
- polars/interchange/utils.py +0 -170
- polars/io/__init__.py +0 -64
- polars/io/_utils.py +0 -317
- polars/io/avro.py +0 -49
- polars/io/clipboard.py +0 -36
- polars/io/cloud/__init__.py +0 -17
- polars/io/cloud/_utils.py +0 -80
- polars/io/cloud/credential_provider/__init__.py +0 -17
- polars/io/cloud/credential_provider/_builder.py +0 -520
- polars/io/cloud/credential_provider/_providers.py +0 -618
- polars/io/csv/__init__.py +0 -9
- polars/io/csv/_utils.py +0 -38
- polars/io/csv/batched_reader.py +0 -142
- polars/io/csv/functions.py +0 -1495
- polars/io/database/__init__.py +0 -6
- polars/io/database/_arrow_registry.py +0 -70
- polars/io/database/_cursor_proxies.py +0 -147
- polars/io/database/_executor.py +0 -578
- polars/io/database/_inference.py +0 -314
- polars/io/database/_utils.py +0 -144
- polars/io/database/functions.py +0 -516
- polars/io/delta.py +0 -499
- polars/io/iceberg/__init__.py +0 -3
- polars/io/iceberg/_utils.py +0 -697
- polars/io/iceberg/dataset.py +0 -556
- polars/io/iceberg/functions.py +0 -151
- polars/io/ipc/__init__.py +0 -8
- polars/io/ipc/functions.py +0 -514
- polars/io/json/__init__.py +0 -3
- polars/io/json/read.py +0 -101
- polars/io/ndjson.py +0 -332
- polars/io/parquet/__init__.py +0 -17
- polars/io/parquet/field_overwrites.py +0 -140
- polars/io/parquet/functions.py +0 -722
- polars/io/partition.py +0 -491
- polars/io/plugins.py +0 -187
- polars/io/pyarrow_dataset/__init__.py +0 -5
- polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
- polars/io/pyarrow_dataset/functions.py +0 -79
- polars/io/scan_options/__init__.py +0 -5
- polars/io/scan_options/_options.py +0 -59
- polars/io/scan_options/cast_options.py +0 -126
- polars/io/spreadsheet/__init__.py +0 -6
- polars/io/spreadsheet/_utils.py +0 -52
- polars/io/spreadsheet/_write_utils.py +0 -647
- polars/io/spreadsheet/functions.py +0 -1323
- polars/lazyframe/__init__.py +0 -9
- polars/lazyframe/engine_config.py +0 -61
- polars/lazyframe/frame.py +0 -8564
- polars/lazyframe/group_by.py +0 -669
- polars/lazyframe/in_process.py +0 -42
- polars/lazyframe/opt_flags.py +0 -333
- polars/meta/__init__.py +0 -14
- polars/meta/build.py +0 -33
- polars/meta/index_type.py +0 -27
- polars/meta/thread_pool.py +0 -50
- polars/meta/versions.py +0 -120
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +0 -213
- polars/ml/utilities.py +0 -30
- polars/plugins.py +0 -155
- polars/py.typed +0 -0
- polars/pyproject.toml +0 -96
- polars/schema.py +0 -265
- polars/selectors.py +0 -3117
- polars/series/__init__.py +0 -5
- polars/series/array.py +0 -776
- polars/series/binary.py +0 -254
- polars/series/categorical.py +0 -246
- polars/series/datetime.py +0 -2275
- polars/series/list.py +0 -1087
- polars/series/plotting.py +0 -191
- polars/series/series.py +0 -9197
- polars/series/string.py +0 -2367
- polars/series/struct.py +0 -154
- polars/series/utils.py +0 -191
- polars/sql/__init__.py +0 -7
- polars/sql/context.py +0 -677
- polars/sql/functions.py +0 -139
- polars/string_cache.py +0 -185
- polars/testing/__init__.py +0 -13
- polars/testing/asserts/__init__.py +0 -9
- polars/testing/asserts/frame.py +0 -231
- polars/testing/asserts/series.py +0 -219
- polars/testing/asserts/utils.py +0 -12
- polars/testing/parametric/__init__.py +0 -33
- polars/testing/parametric/profiles.py +0 -107
- polars/testing/parametric/strategies/__init__.py +0 -22
- polars/testing/parametric/strategies/_utils.py +0 -14
- polars/testing/parametric/strategies/core.py +0 -615
- polars/testing/parametric/strategies/data.py +0 -452
- polars/testing/parametric/strategies/dtype.py +0 -436
- polars/testing/parametric/strategies/legacy.py +0 -169
- polars/type_aliases.py +0 -24
- polars_runtime_compat-1.34.0b2.dist-info/RECORD +0 -203
- {polars_runtime_compat-1.34.0b2.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/WHEEL +0 -0
- {polars_runtime_compat-1.34.0b2.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/licenses/LICENSE +0 -0
polars/io/iceberg/_utils.py
DELETED
@@ -1,697 +0,0 @@
-from __future__ import annotations
-
-import abc
-import ast
-import contextlib
-from _ast import GtE, Lt, LtE
-from ast import (
-    Attribute,
-    BinOp,
-    BitAnd,
-    BitOr,
-    Call,
-    Compare,
-    Constant,
-    Eq,
-    Gt,
-    Invert,
-    List,
-    Name,
-    UnaryOp,
-)
-from dataclasses import dataclass
-from functools import cache, singledispatch
-from typing import TYPE_CHECKING, Any, Callable
-
-import polars._reexport as pl
-from polars._utils.convert import to_py_date, to_py_datetime
-from polars._utils.logging import eprint
-from polars._utils.wrap import wrap_s
-from polars.exceptions import ComputeError
-
-if TYPE_CHECKING:
-    from collections.abc import Sequence
-    from datetime import date, datetime
-
-    import pyiceberg
-    import pyiceberg.schema
-    from pyiceberg.manifest import DataFile
-    from pyiceberg.table import Table
-    from pyiceberg.types import IcebergType
-
-    from polars import DataFrame, Series
-else:
-    from polars._dependencies import pyiceberg
-
-_temporal_conversions: dict[str, Callable[..., datetime | date]] = {
-    "to_py_date": to_py_date,
-    "to_py_datetime": to_py_datetime,
-}
-
-ICEBERG_TIME_TO_NS: int = 1000
-
-
-def _scan_pyarrow_dataset_impl(
-    tbl: Table,
-    with_columns: list[str] | None = None,
-    predicate: str | None = None,
-    n_rows: int | None = None,
-    snapshot_id: int | None = None,
-    **kwargs: Any,
-) -> DataFrame | Series:
-    """
-    Take the projected columns and materialize an arrow table.
-
-    Parameters
-    ----------
-    tbl
-        pyarrow dataset
-    with_columns
-        Columns that are projected
-    predicate
-        pyarrow expression that can be evaluated with eval
-    n_rows:
-        Materialize only n rows from the arrow dataset.
-    snapshot_id:
-        The snapshot ID to scan from.
-    batch_size
-        The maximum row count for scanned pyarrow record batches.
-    kwargs:
-        For backward compatibility
-
-    Returns
-    -------
-    DataFrame
-    """
-    from polars import from_arrow
-
-    scan = tbl.scan(limit=n_rows, snapshot_id=snapshot_id)
-
-    if with_columns is not None:
-        scan = scan.select(*with_columns)
-
-    if predicate is not None:
-        try:
-            expr_ast = _to_ast(predicate)
-            pyiceberg_expr = _convert_predicate(expr_ast)
-        except ValueError as e:
-            msg = f"Could not convert predicate to PyIceberg: {predicate}"
-            raise ValueError(msg) from e
-
-        scan = scan.filter(pyiceberg_expr)
-
-    return from_arrow(scan.to_arrow())
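A minimal usage sketch for the entry point above, assuming tbl is a pyiceberg.table.Table; the predicate string is hypothetical, standing in for whatever the polars scan layer serializes and the _to_ast / _convert_predicate helpers below parse:

    df = _scan_pyarrow_dataset_impl(
        tbl,
        with_columns=["a", "b"],               # projection pushed into tbl.scan()
        predicate="(field('a') > scalar(1))",  # hypothetical serialized predicate
        n_rows=10,                             # becomes tbl.scan(limit=10)
    )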
-
-
-def _to_ast(expr: str) -> ast.expr:
-    """
-    Converts a Python string to an AST.
-
-    This will take the Python Arrow expression (as a string), and it will
-    be converted into a Python AST that can be traversed to convert it to a PyIceberg
-    expression.
-
-    The reason to convert it to an AST is because the PyArrow expression
-    itself doesn't have any methods/properties to traverse the expression.
-    We need this to convert it into a PyIceberg expression.
-
-    Parameters
-    ----------
-    expr
-        The string expression
-
-    Returns
-    -------
-    The AST representing the Arrow expression
-    """
-    return ast.parse(expr, mode="eval").body
-
-
-@singledispatch
-def _convert_predicate(a: Any) -> Any:
-    """Walks the AST to convert the PyArrow expression to a PyIceberg expression."""
-    msg = f"Unexpected symbol: {a}"
-    raise ValueError(msg)
-
-
-@_convert_predicate.register(Constant)
-def _(a: Constant) -> Any:
-    return a.value
-
-
-@_convert_predicate.register(Name)
-def _(a: Name) -> Any:
-    return a.id
-
-
-@_convert_predicate.register(UnaryOp)
-def _(a: UnaryOp) -> Any:
-    if isinstance(a.op, Invert):
-        return pyiceberg.expressions.Not(_convert_predicate(a.operand))
-    else:
-        msg = f"Unexpected UnaryOp: {a}"
-        raise TypeError(msg)
-
-
-@_convert_predicate.register(Call)
-def _(a: Call) -> Any:
-    args = [_convert_predicate(arg) for arg in a.args]
-    f = _convert_predicate(a.func)
-    if f == "field":
-        return args
-    elif f == "scalar":
-        return args[0]
-    elif f in _temporal_conversions:
-        # convert from polars-native i64 to ISO8601 string
-        return _temporal_conversions[f](*args).isoformat()
-    else:
-        ref = _convert_predicate(a.func.value)[0]  # type: ignore[attr-defined]
-        if f == "isin":
-            return pyiceberg.expressions.In(ref, args[0])
-        elif f == "is_null":
-            return pyiceberg.expressions.IsNull(ref)
-        elif f == "is_nan":
-            return pyiceberg.expressions.IsNaN(ref)
-
-    msg = f"Unknown call: {f!r}"
-    raise ValueError(msg)
-
-
-@_convert_predicate.register(Attribute)
-def _(a: Attribute) -> Any:
-    return a.attr
-
-
-@_convert_predicate.register(BinOp)
-def _(a: BinOp) -> Any:
-    lhs = _convert_predicate(a.left)
-    rhs = _convert_predicate(a.right)
-
-    op = a.op
-    if isinstance(op, BitAnd):
-        return pyiceberg.expressions.And(lhs, rhs)
-    if isinstance(op, BitOr):
-        return pyiceberg.expressions.Or(lhs, rhs)
-    else:
-        msg = f"Unknown: {lhs} {op} {rhs}"
-        raise TypeError(msg)
-
-
-@_convert_predicate.register(Compare)
-def _(a: Compare) -> Any:
-    op = a.ops[0]
-    lhs = _convert_predicate(a.left)[0]
-    rhs = _convert_predicate(a.comparators[0])
-
-    if isinstance(op, Gt):
-        return pyiceberg.expressions.GreaterThan(lhs, rhs)
-    if isinstance(op, GtE):
-        return pyiceberg.expressions.GreaterThanOrEqual(lhs, rhs)
-    if isinstance(op, Eq):
-        return pyiceberg.expressions.EqualTo(lhs, rhs)
-    if isinstance(op, Lt):
-        return pyiceberg.expressions.LessThan(lhs, rhs)
-    if isinstance(op, LtE):
-        return pyiceberg.expressions.LessThanOrEqual(lhs, rhs)
-    else:
-        msg = f"Unknown comparison: {op}"
-        raise TypeError(msg)
-
-
-@_convert_predicate.register(List)
-def _(a: List) -> Any:
-    return [_convert_predicate(e) for e in a.elts]
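The singledispatch handlers above walk that AST node by node: field(...) calls collect column references, scalar(...) unwraps literals, BitAnd/BitOr become And/Or, and comparison operators map onto the corresponding PyIceberg expressions. A round-trip sketch with a hypothetical predicate string (assumes pyiceberg is installed):

    import ast

    predicate = "(field('a') > scalar(1)) & field('b').is_null()"
    tree = ast.parse(predicate, mode="eval").body  # equivalent to _to_ast(predicate)
    converted = _convert_predicate(tree)
    # equivalent to:
    # pyiceberg.expressions.And(GreaterThan("a", 1), IsNull("b"))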
-
-
-class IdentityTransformedPartitionValuesBuilder:
-    def __init__(
-        self,
-        table: Table,
-        projected_schema: pyiceberg.schema.Schema,
-    ) -> None:
-        import pyiceberg.schema
-        from pyiceberg.io.pyarrow import schema_to_pyarrow
-        from pyiceberg.transforms import IdentityTransform
-        from pyiceberg.types import (
-            DoubleType,
-            FloatType,
-            IntegerType,
-            LongType,
-        )
-
-        projected_ids: set[int] = projected_schema.field_ids
-
-        # {source_field_id: [values] | error_message}
-        self.partition_values: dict[int, list[Any] | str] = {}
-        # Logical types will have length-2 list [<constructor type>, <cast type>].
-        # E.g. for Datetime it will be [Int64, Datetime]
-        self.partition_values_dtypes: dict[int, pl.DataType] = {}
-
-        # {spec_id: [partition_value_index, source_field_id]}
-        self.partition_spec_id_to_identity_transforms: dict[
-            int, list[tuple[int, int]]
-        ] = {}
-
-        partition_specs = table.specs()
-
-        for spec_id, spec in partition_specs.items():
-            out = []
-
-            for field_index, field in enumerate(spec.fields):
-                if field.source_id in projected_ids and isinstance(
-                    field.transform, IdentityTransform
-                ):
-                    out.append((field_index, field.source_id))
-                    self.partition_values[field.source_id] = []
-
-            self.partition_spec_id_to_identity_transforms[spec_id] = out
-
-        for field_id in self.partition_values:
-            projected_field = projected_schema.find_field(field_id)
-            projected_type = projected_field.field_type
-
-            _, output_dtype = pl.Schema(
-                schema_to_pyarrow(pyiceberg.schema.Schema(projected_field))
-            ).popitem()
-
-            self.partition_values_dtypes[field_id] = output_dtype
-
-            if not projected_type.is_primitive or output_dtype.is_nested():
-                self.partition_values[field_id] = (
-                    f"non-primitive type: {projected_type = } {output_dtype = }"
-                )
-
-            for schema in table.schemas().values():
-                try:
-                    type_this_schema = schema.find_field(field_id).field_type
-                except ValueError:
-                    continue
-
-                if not (
-                    projected_type == type_this_schema
-                    or (
-                        isinstance(projected_type, LongType)
-                        and isinstance(type_this_schema, IntegerType)
-                    )
-                    or (
-                        isinstance(projected_type, (DoubleType, FloatType))
-                        and isinstance(type_this_schema, (DoubleType, FloatType))
-                    )
-                ):
-                    self.partition_values[field_id] = (
-                        f"unsupported type change: from: {type_this_schema}, "
-                        f"to: {projected_type}"
-                    )
-
-    def push_partition_values(
-        self,
-        *,
-        current_index: int,
-        partition_spec_id: int,
-        partition_values: pyiceberg.typedef.Record,
-    ) -> None:
-        try:
-            identity_transforms = self.partition_spec_id_to_identity_transforms[
-                partition_spec_id
-            ]
-        except KeyError:
-            self.partition_values = {
-                k: f"partition spec ID not found: {partition_spec_id}"
-                for k in self.partition_values
-            }
-            return
-
-        for i, source_field_id in identity_transforms:
-            partition_value = partition_values[i]
-
-            if isinstance(values := self.partition_values[source_field_id], list):
-                # extend() - there can be gaps from partitions being
-                # added/removed/re-added
-                values.extend(None for _ in range(current_index - len(values)))
-                values.append(partition_value)
-
-    def finish(self) -> dict[int, pl.Series | str]:
-        from polars.datatypes import Date, Datetime, Duration, Int32, Int64, Time
-
-        out: dict[int, pl.Series | str] = {}
-
-        for field_id, v in self.partition_values.items():
-            if isinstance(v, str):
-                out[field_id] = v
-            else:
-                try:
-                    output_dtype = self.partition_values_dtypes[field_id]
-
-                    constructor_dtype = (
-                        Int64
-                        if isinstance(output_dtype, (Datetime, Duration, Time))
-                        else Int32
-                        if isinstance(output_dtype, Date)
-                        else output_dtype
-                    )
-
-                    s = pl.Series(v, dtype=constructor_dtype)
-
-                    assert not s.dtype.is_nested()
-
-                    if isinstance(output_dtype, Time):
-                        # Physical from PyIceberg is in microseconds, physical
-                        # used by polars is in nanoseconds.
-                        s = s * ICEBERG_TIME_TO_NS
-
-                    s = s.cast(output_dtype)
-
-                    out[field_id] = s
-
-                except Exception as e:
-                    out[field_id] = f"failed to load partition values: {e}"
-
-        return out
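The Time branch in finish() exists because Iceberg stores time-of-day values as microseconds since midnight while the polars physical representation is nanoseconds, hence the multiplication by ICEBERG_TIME_TO_NS (1000). A standalone sketch of that conversion, with a made-up sample value:

    import polars as pl

    micros = 45_296_000_000                         # 12:34:56 as Iceberg time (us)
    s = pl.Series([micros], dtype=pl.Int64) * 1000  # * ICEBERG_TIME_TO_NS
    print(s.cast(pl.Time))                          # [12:34:56]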
-
-
-class IcebergStatisticsLoader:
-    def __init__(
-        self,
-        table: Table,
-        projected_filter_schema: pyiceberg.schema.Schema,
-    ) -> None:
-        import pyiceberg.schema
-        from pyiceberg.io.pyarrow import schema_to_pyarrow
-
-        import polars as pl
-        import polars._utils.logging
-
-        verbose = polars._utils.logging.verbose()
-
-        self.file_column_statistics: dict[int, IcebergColumnStatisticsLoader] = {}
-        self.load_as_empty_statistics: list[str] = []
-        self.file_lengths: list[int] = []
-        self.projected_filter_schema = projected_filter_schema
-
-        for field in projected_filter_schema.fields:
-            field_all_types = set()
-
-            for schema in table.schemas().values():
-                with contextlib.suppress(ValueError):
-                    field_all_types.add(schema.find_field(field.field_id).field_type)
-
-            _, field_polars_dtype = pl.Schema(
-                schema_to_pyarrow(pyiceberg.schema.Schema(field))
-            ).popitem()
-
-            load_from_bytes_impl = LoadFromBytesImpl.init_for_field_type(
-                field.field_type,
-                field_all_types,
-                field_polars_dtype,
-            )
-
-            if verbose:
-                _load_from_bytes_impl = (
-                    type(load_from_bytes_impl).__name__
-                    if load_from_bytes_impl is not None
-                    else "None"
-                )
-
-                eprint(
-                    "IcebergStatisticsLoader: "
-                    f"{field.name = }, "
-                    f"{field.field_id = }, "
-                    f"{field.field_type = }, "
-                    f"{field_all_types = }, "
-                    f"{field_polars_dtype = }, "
-                    f"{_load_from_bytes_impl = }"
-                )
-
-            self.file_column_statistics[field.field_id] = IcebergColumnStatisticsLoader(
-                field_id=field.field_id,
-                column_name=field.name,
-                column_dtype=field_polars_dtype,
-                load_from_bytes_impl=load_from_bytes_impl,
-                min_values=[],
-                max_values=[],
-                null_count=[],
-            )
-
-    def push_file_statistics(self, file: DataFile) -> None:
-        self.file_lengths.append(file.record_count)
-
-        for stats in self.file_column_statistics.values():
-            stats.push_file_statistics(file)
-
-    def finish(
-        self,
-        expected_height: int,
-        identity_transformed_values: dict[int, pl.Series | str],
-    ) -> pl.DataFrame:
-        import polars as pl
-
-        out: list[pl.DataFrame] = [
-            pl.Series("len", self.file_lengths, dtype=pl.UInt32).to_frame()
-        ]
-
-        for field_id, stat_builder in self.file_column_statistics.items():
-            if (p := identity_transformed_values.get(field_id)) is not None:
-                if isinstance(p, str):
-                    msg = f"statistics load failure for filter column: {p}"
-                    raise ComputeError(msg)
-
-            column_stats_df = stat_builder.finish(expected_height, p)
-            out.append(column_stats_df)
-
-        return pl.concat(out, how="horizontal")
-
-
-@dataclass
-class IcebergColumnStatisticsLoader:
-    column_name: str
-    column_dtype: pl.DataType
-    field_id: int
-    load_from_bytes_impl: LoadFromBytesImpl | None
-    null_count: list[int | None]
-    min_values: list[bytes | None]
-    max_values: list[bytes | None]
-
-    def push_file_statistics(self, file: DataFile) -> None:
-        self.null_count.append(file.null_value_counts.get(self.field_id))
-
-        if self.load_from_bytes_impl is not None:
-            self.min_values.append(file.lower_bounds.get(self.field_id))
-            self.max_values.append(file.upper_bounds.get(self.field_id))
-
-    def finish(
-        self,
-        expected_height: int,
-        identity_transformed_values: pl.Series | None,
-    ) -> pl.DataFrame:
-        import polars as pl
-
-        c = self.column_name
-        assert len(self.null_count) == expected_height
-
-        out = pl.Series(f"{c}_nc", self.null_count, dtype=pl.UInt32).to_frame()
-
-        if self.load_from_bytes_impl is None:
-            s = (
-                identity_transformed_values
-                if identity_transformed_values is not None
-                else pl.repeat(None, expected_height, dtype=self.column_dtype)
-            )
-
-            return out.with_columns(s.alias(f"{c}_min"), s.alias(f"{c}_max"))
-
-        assert len(self.min_values) == expected_height
-        assert len(self.max_values) == expected_height
-
-        if self.column_dtype.is_nested():
-            raise NotImplementedError
-
-        min_values = self.load_from_bytes_impl.load_from_bytes(self.min_values)
-        max_values = self.load_from_bytes_impl.load_from_bytes(self.max_values)
-
-        if identity_transformed_values is not None:
-            assert identity_transformed_values.dtype == self.column_dtype
-
-            identity_transformed_values = identity_transformed_values.extend_constant(
-                None, expected_height - identity_transformed_values.len()
-            )
-
-            min_values = identity_transformed_values.fill_null(min_values)
-            max_values = identity_transformed_values.fill_null(max_values)
-
-        return out.with_columns(
-            min_values.alias(f"{c}_min"), max_values.alias(f"{c}_max")
-        )
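Column naming in the per-file statistics frame follows the two finish() methods above: one row per data file, a UInt32 "len" column from record_count, plus "<name>_nc" / "<name>_min" / "<name>_max" columns per filter field. An illustrative frame for a single integer column "a" across two data files (all values hypothetical):

    import polars as pl

    stats = pl.DataFrame(
        {
            "len":   pl.Series([100, 80], dtype=pl.UInt32),  # record_count per file
            "a_nc":  pl.Series([0, 5], dtype=pl.UInt32),     # null_value_counts
            "a_min": [1, 42],                                # decoded lower_bounds
            "a_max": [10, 99],                               # decoded upper_bounds
        }
    )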
-
-
-# Lazy init instead of global const as PyIceberg is an optional dependency
-@cache
-def _bytes_loader_lookup() -> dict[
-    type[IcebergType],
-    tuple[type[LoadFromBytesImpl], type[IcebergType] | Sequence[type[IcebergType]]],
-]:
-    from pyiceberg.types import (
-        BinaryType,
-        BooleanType,
-        DateType,
-        DecimalType,
-        FixedType,
-        IntegerType,
-        LongType,
-        StringType,
-        TimestampType,
-        TimestamptzType,
-        TimeType,
-    )
-
-    # TODO: Float statistics
-    return {
-        BooleanType: (LoadBooleanFromBytes, BooleanType),
-        DateType: (LoadDateFromBytes, DateType),
-        TimeType: (LoadTimeFromBytes, TimeType),
-        TimestampType: (LoadTimestampFromBytes, TimestampType),
-        TimestamptzType: (LoadTimestamptzFromBytes, TimestamptzType),
-        IntegerType: (LoadInt32FromBytes, IntegerType),
-        LongType: (LoadInt64FromBytes, (LongType, IntegerType)),
-        StringType: (LoadStringFromBytes, StringType),
-        BinaryType: (LoadBinaryFromBytes, BinaryType),
-        DecimalType: (LoadDecimalFromBytes, DecimalType),
-        FixedType: (LoadFixedFromBytes, FixedType),
-    }
-
-
-class LoadFromBytesImpl(abc.ABC):
-    def __init__(self, polars_dtype: pl.DataType) -> None:
-        self.polars_dtype = polars_dtype
-
-    @staticmethod
-    def init_for_field_type(
-        current_field_type: IcebergType,
-        # All types that this field ID has been set to across schema changes.
-        all_field_types: set[IcebergType],
-        field_polars_dtype: pl.DataType,
-    ) -> LoadFromBytesImpl | None:
-        if (v := _bytes_loader_lookup().get(type(current_field_type))) is None:
-            return None
-
-        loader_impl, allowed_field_types = v
-
-        return (
-            loader_impl(field_polars_dtype)
-            if all(isinstance(x, allowed_field_types) for x in all_field_types)  # type: ignore[arg-type]
-            else None
-        )
-
-    @abc.abstractmethod
-    def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-        """`bytes_values` should be of binary type."""
-
-
-class LoadBinaryFromBytes(LoadFromBytesImpl):
-    def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-        import polars as pl
-
-        return pl.Series(byte_values, dtype=pl.Binary)
-
-
-class LoadDateFromBytes(LoadFromBytesImpl):
-    def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-        import polars as pl
-
-        return (
-            pl.Series(byte_values, dtype=pl.Binary)
-            .bin.reinterpret(dtype=pl.Int32, endianness="little")
-            .cast(pl.Date)
-        )
-
-
-class LoadTimeFromBytes(LoadFromBytesImpl):
-    def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-        import polars as pl
-
-        return (
-            pl.Series(byte_values, dtype=pl.Binary).bin.reinterpret(
-                dtype=pl.Int64, endianness="little"
-            )
-            * ICEBERG_TIME_TO_NS
-        ).cast(pl.Time)
-
-
-class LoadTimestampFromBytes(LoadFromBytesImpl):
-    def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-        import polars as pl
-
-        return (
-            pl.Series(byte_values, dtype=pl.Binary)
-            .bin.reinterpret(dtype=pl.Int64, endianness="little")
-            .cast(pl.Datetime("us"))
-        )
-
-
-class LoadTimestamptzFromBytes(LoadFromBytesImpl):
-    def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-        import polars as pl
-
-        return (
-            pl.Series(byte_values, dtype=pl.Binary)
-            .bin.reinterpret(dtype=pl.Int64, endianness="little")
-            .cast(pl.Datetime("us", time_zone="UTC"))
-        )
-
-
-class LoadBooleanFromBytes(LoadFromBytesImpl):
-    def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-        import polars as pl
-
-        return (
-            pl.Series(byte_values, dtype=pl.Binary)
-            .bin.reinterpret(dtype=pl.UInt8, endianness="little")
-            .cast(pl.Boolean)
-        )
-
-
-class LoadDecimalFromBytes(LoadFromBytesImpl):
-    def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-        import polars as pl
-        from polars._plr import PySeries
-
-        dtype = self.polars_dtype
-        assert isinstance(dtype, pl.Decimal)
-        assert dtype.precision is not None
-
-        return wrap_s(
-            PySeries._import_decimal_from_iceberg_binary_repr(
-                bytes_list=byte_values,
-                precision=dtype.precision,
-                scale=dtype.scale,
-            )
-        )
-
-
-class LoadFixedFromBytes(LoadBinaryFromBytes): ...
-
-
-class LoadInt32FromBytes(LoadFromBytesImpl):
-    def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-        import polars as pl
-
-        return pl.Series(byte_values, dtype=pl.Binary).bin.reinterpret(
-            dtype=pl.Int32, endianness="little"
-        )
-
-
-class LoadInt64FromBytes(LoadFromBytesImpl):
-    def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-        import polars as pl
-
-        s = pl.Series(byte_values, dtype=pl.Binary)
-
-        return s.bin.reinterpret(dtype=pl.Int64, endianness="little").fill_null(
-            s.bin.reinterpret(dtype=pl.Int32, endianness="little").cast(pl.Int64)
-        )
-
-
-class LoadStringFromBytes(LoadFromBytesImpl):
-    def load_from_bytes(self, byte_values: list[bytes | None]) -> pl.Series:
-        import polars as pl
-
-        return pl.Series(byte_values, dtype=pl.Binary).cast(pl.String)
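All of the integer-backed loaders above decode the little-endian bound bytes that Iceberg keeps in lower_bounds/upper_bounds via Series.bin.reinterpret; LoadInt64FromBytes additionally fill-nulls with a 4-byte read, covering bounds written while the field was still an IntegerType. A self-contained sketch of the same decoding (sample bytes made up):

    import polars as pl

    raw = [(1234).to_bytes(4, "little"), None]  # 4-byte little-endian payloads
    decoded = pl.Series(raw, dtype=pl.Binary).bin.reinterpret(
        dtype=pl.Int32, endianness="little"
    )
    print(decoded.to_list())  # [1234, None]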