polars-runtime-compat 1.34.0b2__cp39-abi3-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _polars_runtime_compat/.gitkeep +0 -0
- _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
- polars/__init__.py +528 -0
- polars/_cpu_check.py +265 -0
- polars/_dependencies.py +355 -0
- polars/_plr.py +99 -0
- polars/_plr.pyi +2496 -0
- polars/_reexport.py +23 -0
- polars/_typing.py +478 -0
- polars/_utils/__init__.py +37 -0
- polars/_utils/async_.py +102 -0
- polars/_utils/cache.py +176 -0
- polars/_utils/cloud.py +40 -0
- polars/_utils/constants.py +29 -0
- polars/_utils/construction/__init__.py +46 -0
- polars/_utils/construction/dataframe.py +1397 -0
- polars/_utils/construction/other.py +72 -0
- polars/_utils/construction/series.py +560 -0
- polars/_utils/construction/utils.py +118 -0
- polars/_utils/convert.py +224 -0
- polars/_utils/deprecation.py +406 -0
- polars/_utils/getitem.py +457 -0
- polars/_utils/logging.py +11 -0
- polars/_utils/nest_asyncio.py +264 -0
- polars/_utils/parquet.py +15 -0
- polars/_utils/parse/__init__.py +12 -0
- polars/_utils/parse/expr.py +242 -0
- polars/_utils/polars_version.py +19 -0
- polars/_utils/pycapsule.py +53 -0
- polars/_utils/scan.py +27 -0
- polars/_utils/serde.py +63 -0
- polars/_utils/slice.py +215 -0
- polars/_utils/udfs.py +1251 -0
- polars/_utils/unstable.py +63 -0
- polars/_utils/various.py +782 -0
- polars/_utils/wrap.py +25 -0
- polars/api.py +370 -0
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +19 -0
- polars/catalog/unity/client.py +733 -0
- polars/catalog/unity/models.py +152 -0
- polars/config.py +1571 -0
- polars/convert/__init__.py +25 -0
- polars/convert/general.py +1046 -0
- polars/convert/normalize.py +261 -0
- polars/dataframe/__init__.py +5 -0
- polars/dataframe/_html.py +186 -0
- polars/dataframe/frame.py +12582 -0
- polars/dataframe/group_by.py +1067 -0
- polars/dataframe/plotting.py +257 -0
- polars/datatype_expr/__init__.py +5 -0
- polars/datatype_expr/array.py +56 -0
- polars/datatype_expr/datatype_expr.py +304 -0
- polars/datatype_expr/list.py +18 -0
- polars/datatype_expr/struct.py +69 -0
- polars/datatypes/__init__.py +122 -0
- polars/datatypes/_parse.py +195 -0
- polars/datatypes/_utils.py +48 -0
- polars/datatypes/classes.py +1213 -0
- polars/datatypes/constants.py +11 -0
- polars/datatypes/constructor.py +172 -0
- polars/datatypes/convert.py +366 -0
- polars/datatypes/group.py +130 -0
- polars/exceptions.py +230 -0
- polars/expr/__init__.py +7 -0
- polars/expr/array.py +964 -0
- polars/expr/binary.py +346 -0
- polars/expr/categorical.py +306 -0
- polars/expr/datetime.py +2620 -0
- polars/expr/expr.py +11272 -0
- polars/expr/list.py +1408 -0
- polars/expr/meta.py +444 -0
- polars/expr/name.py +321 -0
- polars/expr/string.py +3045 -0
- polars/expr/struct.py +357 -0
- polars/expr/whenthen.py +185 -0
- polars/functions/__init__.py +193 -0
- polars/functions/aggregation/__init__.py +33 -0
- polars/functions/aggregation/horizontal.py +298 -0
- polars/functions/aggregation/vertical.py +341 -0
- polars/functions/as_datatype.py +848 -0
- polars/functions/business.py +138 -0
- polars/functions/col.py +384 -0
- polars/functions/datatype.py +121 -0
- polars/functions/eager.py +524 -0
- polars/functions/escape_regex.py +29 -0
- polars/functions/lazy.py +2751 -0
- polars/functions/len.py +68 -0
- polars/functions/lit.py +210 -0
- polars/functions/random.py +22 -0
- polars/functions/range/__init__.py +19 -0
- polars/functions/range/_utils.py +15 -0
- polars/functions/range/date_range.py +303 -0
- polars/functions/range/datetime_range.py +370 -0
- polars/functions/range/int_range.py +348 -0
- polars/functions/range/linear_space.py +311 -0
- polars/functions/range/time_range.py +287 -0
- polars/functions/repeat.py +301 -0
- polars/functions/whenthen.py +353 -0
- polars/interchange/__init__.py +10 -0
- polars/interchange/buffer.py +77 -0
- polars/interchange/column.py +190 -0
- polars/interchange/dataframe.py +230 -0
- polars/interchange/from_dataframe.py +328 -0
- polars/interchange/protocol.py +303 -0
- polars/interchange/utils.py +170 -0
- polars/io/__init__.py +64 -0
- polars/io/_utils.py +317 -0
- polars/io/avro.py +49 -0
- polars/io/clipboard.py +36 -0
- polars/io/cloud/__init__.py +17 -0
- polars/io/cloud/_utils.py +80 -0
- polars/io/cloud/credential_provider/__init__.py +17 -0
- polars/io/cloud/credential_provider/_builder.py +520 -0
- polars/io/cloud/credential_provider/_providers.py +618 -0
- polars/io/csv/__init__.py +9 -0
- polars/io/csv/_utils.py +38 -0
- polars/io/csv/batched_reader.py +142 -0
- polars/io/csv/functions.py +1495 -0
- polars/io/database/__init__.py +6 -0
- polars/io/database/_arrow_registry.py +70 -0
- polars/io/database/_cursor_proxies.py +147 -0
- polars/io/database/_executor.py +578 -0
- polars/io/database/_inference.py +314 -0
- polars/io/database/_utils.py +144 -0
- polars/io/database/functions.py +516 -0
- polars/io/delta.py +499 -0
- polars/io/iceberg/__init__.py +3 -0
- polars/io/iceberg/_utils.py +697 -0
- polars/io/iceberg/dataset.py +556 -0
- polars/io/iceberg/functions.py +151 -0
- polars/io/ipc/__init__.py +8 -0
- polars/io/ipc/functions.py +514 -0
- polars/io/json/__init__.py +3 -0
- polars/io/json/read.py +101 -0
- polars/io/ndjson.py +332 -0
- polars/io/parquet/__init__.py +17 -0
- polars/io/parquet/field_overwrites.py +140 -0
- polars/io/parquet/functions.py +722 -0
- polars/io/partition.py +491 -0
- polars/io/plugins.py +187 -0
- polars/io/pyarrow_dataset/__init__.py +5 -0
- polars/io/pyarrow_dataset/anonymous_scan.py +109 -0
- polars/io/pyarrow_dataset/functions.py +79 -0
- polars/io/scan_options/__init__.py +5 -0
- polars/io/scan_options/_options.py +59 -0
- polars/io/scan_options/cast_options.py +126 -0
- polars/io/spreadsheet/__init__.py +6 -0
- polars/io/spreadsheet/_utils.py +52 -0
- polars/io/spreadsheet/_write_utils.py +647 -0
- polars/io/spreadsheet/functions.py +1323 -0
- polars/lazyframe/__init__.py +9 -0
- polars/lazyframe/engine_config.py +61 -0
- polars/lazyframe/frame.py +8564 -0
- polars/lazyframe/group_by.py +669 -0
- polars/lazyframe/in_process.py +42 -0
- polars/lazyframe/opt_flags.py +333 -0
- polars/meta/__init__.py +14 -0
- polars/meta/build.py +33 -0
- polars/meta/index_type.py +27 -0
- polars/meta/thread_pool.py +50 -0
- polars/meta/versions.py +120 -0
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +213 -0
- polars/ml/utilities.py +30 -0
- polars/plugins.py +155 -0
- polars/py.typed +0 -0
- polars/pyproject.toml +96 -0
- polars/schema.py +265 -0
- polars/selectors.py +3117 -0
- polars/series/__init__.py +5 -0
- polars/series/array.py +776 -0
- polars/series/binary.py +254 -0
- polars/series/categorical.py +246 -0
- polars/series/datetime.py +2275 -0
- polars/series/list.py +1087 -0
- polars/series/plotting.py +191 -0
- polars/series/series.py +9197 -0
- polars/series/string.py +2367 -0
- polars/series/struct.py +154 -0
- polars/series/utils.py +191 -0
- polars/sql/__init__.py +7 -0
- polars/sql/context.py +677 -0
- polars/sql/functions.py +139 -0
- polars/string_cache.py +185 -0
- polars/testing/__init__.py +13 -0
- polars/testing/asserts/__init__.py +9 -0
- polars/testing/asserts/frame.py +231 -0
- polars/testing/asserts/series.py +219 -0
- polars/testing/asserts/utils.py +12 -0
- polars/testing/parametric/__init__.py +33 -0
- polars/testing/parametric/profiles.py +107 -0
- polars/testing/parametric/strategies/__init__.py +22 -0
- polars/testing/parametric/strategies/_utils.py +14 -0
- polars/testing/parametric/strategies/core.py +615 -0
- polars/testing/parametric/strategies/data.py +452 -0
- polars/testing/parametric/strategies/dtype.py +436 -0
- polars/testing/parametric/strategies/legacy.py +169 -0
- polars/type_aliases.py +24 -0
- polars_runtime_compat-1.34.0b2.dist-info/METADATA +190 -0
- polars_runtime_compat-1.34.0b2.dist-info/RECORD +203 -0
- polars_runtime_compat-1.34.0b2.dist-info/WHEEL +4 -0
- polars_runtime_compat-1.34.0b2.dist-info/licenses/LICENSE +20 -0
polars/io/iceberg/dataset.py
@@ -0,0 +1,556 @@
+from __future__ import annotations
+
+import os
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from functools import partial
+from time import perf_counter
+from typing import TYPE_CHECKING, Any, Literal
+
+import polars._reexport as pl
+from polars._utils.logging import eprint, verbose
+from polars.exceptions import ComputeError
+from polars.io.iceberg._utils import (
+    IcebergStatisticsLoader,
+    IdentityTransformedPartitionValuesBuilder,
+    _scan_pyarrow_dataset_impl,
+)
+from polars.io.scan_options.cast_options import ScanCastOptions
+
+if TYPE_CHECKING:
+    import pyarrow as pa
+    import pyiceberg.schema
+    from pyiceberg.table import Table
+
+    from polars.lazyframe.frame import LazyFrame
+
+
+class IcebergDataset:
+    """Dataset interface for PyIceberg."""
+
+    def __init__(
+        self,
+        source: str | Table,
+        *,
+        snapshot_id: int | None = None,
+        iceberg_storage_properties: dict[str, Any] | None = None,
+        reader_override: Literal["native", "pyiceberg"] | None = None,
+        use_metadata_statistics: bool = True,
+    ) -> None:
+        self._metadata_path = None
+        self._table = None
+        self._snapshot_id = snapshot_id
+        self._iceberg_storage_properties = iceberg_storage_properties
+        self._reader_override: Literal["native", "pyiceberg"] | None = reader_override
+        self._use_metadata_statistics = use_metadata_statistics
+
+        # Accept either a path or a table object. The one we don't have is
+        # lazily initialized when needed.
+
+        if isinstance(source, str):
+            self._metadata_path = source
+        else:
+            self._table = source
+
+    #
+    # PythonDatasetProvider interface functions
+    #
+
+    def schema(self) -> pa.schema:
+        """Fetch the schema of the table."""
+        return self.arrow_schema()
+
+    def arrow_schema(self) -> pa.schema:
+        """Fetch the arrow schema of the table."""
+        from pyiceberg.io.pyarrow import schema_to_pyarrow
+
+        return schema_to_pyarrow(self.table().schema())
+
+    def to_dataset_scan(
+        self,
+        *,
+        existing_resolved_version_key: str | None = None,
+        limit: int | None = None,
+        projection: list[str] | None = None,
+        filter_columns: list[str] | None = None,
+    ) -> tuple[LazyFrame, str] | None:
+        """Construct a LazyFrame scan."""
+        if (
+            scan_data := self._to_dataset_scan_impl(
+                existing_resolved_version_key=existing_resolved_version_key,
+                limit=limit,
+                projection=projection,
+                filter_columns=filter_columns,
+            )
+        ) is None:
+            return None
+
+        return scan_data.to_lazyframe(), scan_data.snapshot_id_key
+
+    def _to_dataset_scan_impl(
+        self,
+        *,
+        existing_resolved_version_key: str | None = None,
+        limit: int | None = None,
+        projection: list[str] | None = None,
+        filter_columns: list[str] | None = None,
+    ) -> _NativeIcebergScanData | _PyIcebergScanData | None:
+        from pyiceberg.io.pyarrow import schema_to_pyarrow
+
+        import polars._utils.logging
+
+        verbose = polars._utils.logging.verbose()
+
+        if verbose:
+            eprint(
+                "IcebergDataset: to_dataset_scan(): "
+                f"snapshot ID: {self._snapshot_id}, "
+                f"limit: {limit}, "
+                f"projection: {projection}, "
+                f"filter_columns: {filter_columns}, "
+                f"self._use_metadata_statistics: {self._use_metadata_statistics}"
+            )
+
+        tbl = self.table()
+
+        if verbose:
+            eprint(
+                "IcebergDataset: to_dataset_scan(): "
+                f"tbl.metadata.current_snapshot_id: {tbl.metadata.current_snapshot_id}"
+            )
+
+        snapshot_id = self._snapshot_id
+        schema_id = None
+
+        if snapshot_id is not None:
+            snapshot = tbl.snapshot_by_id(snapshot_id)
+
+            if snapshot is None:
+                msg = f"iceberg snapshot ID not found: {snapshot_id}"
+                raise ValueError(msg)
+
+            schema_id = snapshot.schema_id
+
+            if schema_id is None:
+                msg = (
+                    f"IcebergDataset: requested snapshot {snapshot_id} "
+                    "did not contain a schema ID"
+                )
+                raise ValueError(msg)
+
+            iceberg_schema = tbl.schemas()[schema_id]
+            snapshot_id_key = f"{snapshot.snapshot_id}"
+        else:
+            iceberg_schema = tbl.schema()
+            schema_id = tbl.metadata.current_schema_id
+
+            snapshot_id_key = (
+                f"{v.snapshot_id}" if (v := tbl.current_snapshot()) is not None else ""
+            )
+
+        if (
+            existing_resolved_version_key is not None
+            and existing_resolved_version_key == snapshot_id_key
+        ):
+            if verbose:
+                eprint(
+                    "IcebergDataset: to_dataset_scan(): early return "
+                    f"({snapshot_id_key = })"
+                )
+
+            return None
+
+        # Take from parameter first then envvar
+        reader_override = self._reader_override or os.getenv(
+            "POLARS_ICEBERG_READER_OVERRIDE"
+        )
+
+        if reader_override and reader_override not in ["native", "pyiceberg"]:
+            msg = (
+                "iceberg: unknown value for reader_override: "
+                f"'{reader_override}', expected one of ('native', 'pyiceberg')"
+            )
+            raise ValueError(msg)
+
+        fallback_reason = (
+            "forced reader_override='pyiceberg'"
+            if reader_override == "pyiceberg"
+            else f"unsupported table format version: {tbl.format_version}"
+            if not tbl.format_version <= 2
+            else None
+        )
+
+        selected_fields = ("*",) if projection is None else tuple(projection)
+
+        projected_iceberg_schema = (
+            iceberg_schema
+            if selected_fields == ("*",)
+            else iceberg_schema.select(*selected_fields)
+        )
+
+        sources = []
+        missing_field_defaults = IdentityTransformedPartitionValuesBuilder(
+            tbl,
+            projected_iceberg_schema,
+        )
+        statistics_loader: IcebergStatisticsLoader | None = (
+            IcebergStatisticsLoader(tbl, iceberg_schema.select(*filter_columns))
+            if self._use_metadata_statistics and filter_columns is not None
+            else None
+        )
+        deletion_files: dict[int, list[str]] = {}
+
+        if reader_override != "pyiceberg" and not fallback_reason:
+            from pyiceberg.manifest import DataFileContent, FileFormat
+
+            if verbose:
+                eprint("IcebergDataset: to_dataset_scan(): begin path expansion")
+
+            start_time = perf_counter()
+
+            scan = tbl.scan(
+                snapshot_id=snapshot_id,
+                limit=limit,
+                selected_fields=selected_fields,
+            )
+
+            total_deletion_files = 0
+
+            for i, file_info in enumerate(scan.plan_files()):
+                if file_info.file.file_format != FileFormat.PARQUET:
+                    fallback_reason = (
+                        f"non-parquet format: {file_info.file.file_format}"
+                    )
+                    break
+
+                if file_info.delete_files:
+                    deletion_files[i] = []
+
+                    for deletion_file in file_info.delete_files:
+                        if deletion_file.content != DataFileContent.POSITION_DELETES:
+                            fallback_reason = (
+                                "unsupported deletion file type: "
+                                f"{deletion_file.content}"
+                            )
+                            break
+
+                        if deletion_file.file_format != FileFormat.PARQUET:
+                            fallback_reason = (
+                                "unsupported deletion file format: "
+                                f"{deletion_file.file_format}"
+                            )
+                            break
+
+                        deletion_files[i].append(deletion_file.file_path)
+                        total_deletion_files += 1
+
+                if fallback_reason:
+                    break
+
+                missing_field_defaults.push_partition_values(
+                    current_index=i,
+                    partition_spec_id=file_info.file.spec_id,
+                    partition_values=file_info.file.partition,
+                )
+
+                if statistics_loader is not None:
+                    statistics_loader.push_file_statistics(file_info.file)
+
+                sources.append(file_info.file.file_path)
+
+            if verbose:
+                elapsed = perf_counter() - start_time
+                eprint(
+                    "IcebergDataset: to_dataset_scan(): "
+                    f"finish path expansion ({elapsed:.3f}s)"
+                )
+
+        if not fallback_reason:
+            if verbose:
+                s = "" if len(sources) == 1 else "s"
+                s2 = "" if total_deletion_files == 1 else "s"
+
+                eprint(
+                    "IcebergDataset: to_dataset_scan(): "
+                    f"native scan_parquet(): "
+                    f"{len(sources)} source{s}, "
+                    f"snapshot ID: {snapshot_id}, "
+                    f"schema ID: {schema_id}, "
+                    f"{total_deletion_files} deletion file{s2}"
+                )
+
+            # The arrow schema returned by `schema_to_pyarrow` will contain
+            # 'PARQUET:field_id'
+            column_mapping = schema_to_pyarrow(iceberg_schema)
+
+            identity_transformed_values = missing_field_defaults.finish()
+
+            min_max_statistics = (
+                statistics_loader.finish(len(sources), identity_transformed_values)
+                if statistics_loader is not None
+                else None
+            )
+
+            storage_options = (
+                _convert_iceberg_to_object_store_storage_options(
+                    self._iceberg_storage_properties
+                )
+                if self._iceberg_storage_properties is not None
+                else None
+            )
+
+            return _NativeIcebergScanData(
+                sources=sources,
+                projected_iceberg_schema=projected_iceberg_schema,
+                column_mapping=column_mapping,
+                default_values=identity_transformed_values,
+                deletion_files=deletion_files,
+                min_max_statistics=min_max_statistics,
+                statistics_loader=statistics_loader,
+                storage_options=storage_options,
+                _snapshot_id_key=snapshot_id_key,
+            )
+
+        elif reader_override == "native":
+            msg = f"iceberg reader_override='native' failed: {fallback_reason}"
+            raise ComputeError(msg)
+
+        if verbose:
+            eprint(
+                "IcebergDataset: to_dataset_scan(): "
+                f"fallback to python[pyiceberg] scan: {fallback_reason}"
+            )
+
+        func = partial(
+            _scan_pyarrow_dataset_impl,
+            tbl,
+            snapshot_id=snapshot_id,
+            n_rows=limit,
+            with_columns=projection,
+        )
+
+        arrow_schema = schema_to_pyarrow(tbl.schema())
+
+        lf = pl.LazyFrame._scan_python_function(
+            arrow_schema,
+            func,
+            pyarrow=True,
+            is_pure=True,
+        )
+
+        return _PyIcebergScanData(lf=lf, _snapshot_id_key=snapshot_id_key)
+
+    #
+    # Accessors
+    #
+
+    def metadata_path(self) -> str:
+        """Fetch the metadata path."""
+        if self._metadata_path is None:
+            if self._table is None:
+                msg = "impl error: both metadata_path and table are None"
+                raise ValueError(msg)
+
+            self._metadata_path = self.table().metadata_location
+
+        return self._metadata_path
+
+    def table(self) -> Table:
+        """Fetch the PyIceberg Table object."""
+        if self._table is None:
+            if self._metadata_path is None:
+                msg = "impl error: both metadata_path and table are None"
+                raise ValueError(msg)
+
+            if verbose():
+                eprint(f"IcebergDataset: construct table from {self._metadata_path = }")
+
+            from pyiceberg.table import StaticTable
+
+            self._table = StaticTable.from_metadata(
+                metadata_location=self._metadata_path,
+                properties=self._iceberg_storage_properties or {},
+            )
+
+        return self._table
+
+    #
+    # Serialization functions
+    #
+    # We don't serialize the iceberg table object - the remote machine should
+    # use their own permissions to reconstruct the table object from the path.
+    #
+
+    def __getstate__(self) -> dict[str, Any]:
+        state = {
+            "metadata_path": self.metadata_path(),
+            "snapshot_id": self._snapshot_id,
+            "iceberg_storage_properties": self._iceberg_storage_properties,
+            "reader_override": self._reader_override,
+        }
+
+        if verbose():
+            path_repr = state["metadata_path"]
+            snapshot_id = f"'{v}'" if (v := state["snapshot_id"]) is not None else None
+            keys_repr = _redact_dict_values(state["iceberg_storage_properties"])
+            reader_override = state["reader_override"]
+
+            eprint(
+                "IcebergDataset: getstate(): "
+                f"path: '{path_repr}', "
+                f"snapshot_id: {snapshot_id}, "
+                f"iceberg_storage_properties: {keys_repr}, "
+                f"reader_override: {reader_override}"
+            )
+
+        return state
+
+    def __setstate__(self, state: dict[str, Any]) -> None:
+        if verbose():
+            path_repr = state["metadata_path"]
+            snapshot_id = state["snapshot_id"]
+            keys_repr = _redact_dict_values(state["iceberg_storage_properties"])
+            reader_override = state["reader_override"]
+
+            eprint(
+                "IcebergDataset: getstate(): "
+                f"path: '{path_repr}', "
+                f"snapshot_id: '{snapshot_id}', "
+                f"iceberg_storage_properties: {keys_repr}, "
+                f"reader_override: {reader_override}"
+            )
+
+        IcebergDataset.__init__(
+            self,
+            state["metadata_path"],
+            snapshot_id=state["snapshot_id"],
+            iceberg_storage_properties=state["iceberg_storage_properties"],
+            reader_override=state["reader_override"],
+        )
+
+
+class _ResolvedScanDataBase(ABC):
+    @abstractmethod
+    def to_lazyframe(self) -> pl.LazyFrame: ...
+
+    @property
+    @abstractmethod
+    def snapshot_id_key(self) -> str: ...
+
+
+@dataclass
+class _NativeIcebergScanData(_ResolvedScanDataBase):
+    """Resolved parameters for a native Iceberg scan."""
+
+    sources: list[str]
+    projected_iceberg_schema: pyiceberg.schema.Schema
+    column_mapping: pa.Schema
+    default_values: dict[int, pl.Series | str]
+    deletion_files: dict[int, list[str]]
+    min_max_statistics: pl.DataFrame | None
+    # This is here for test purposes, as the `min_max_statistics` on this
+    # dataclass contain coalesced values from `default_values`, a test may
+    # access the statistics loader directly to inspect the values before
+    # coalescing.
+    statistics_loader: IcebergStatisticsLoader | None
+    storage_options: dict[str, str] | None
+    _snapshot_id_key: str
+
+    def to_lazyframe(self) -> pl.LazyFrame:
+        from polars.io.parquet.functions import scan_parquet
+
+        return scan_parquet(
+            self.sources,
+            cast_options=ScanCastOptions._default_iceberg(),
+            missing_columns="insert",
+            extra_columns="ignore",
+            storage_options=self.storage_options,
+            _column_mapping=("iceberg-column-mapping", self.column_mapping),
+            _default_values=("iceberg", self.default_values),
+            _deletion_files=("iceberg-position-delete", self.deletion_files),
+            _table_statistics=self.min_max_statistics,
+        )
+
+    @property
+    def snapshot_id_key(self) -> str:
+        return self._snapshot_id_key
+
+
+@dataclass
+class _PyIcebergScanData(_ResolvedScanDataBase):
+    """Resolved parameters for reading via PyIceberg."""
+
+    # We're not interested in inspecting anything for the pyiceberg scan, so
+    # this class is just a wrapper.
+    lf: pl.LazyFrame
+    _snapshot_id_key: str
+
+    def to_lazyframe(self) -> pl.LazyFrame:
+        return self.lf
+
+    @property
+    def snapshot_id_key(self) -> str:
+        return self._snapshot_id_key
+
+
+def _redact_dict_values(obj: Any) -> Any:
+    return (
+        {k: "REDACTED" for k in obj.keys()}  # noqa: SIM118
+        if isinstance(obj, dict)
+        else f"<{type(obj).__name__} object>"
+        if obj is not None
+        else "None"
+    )
+
+
+def _convert_iceberg_to_object_store_storage_options(
+    iceberg_storage_properties: dict[str, str],
+) -> dict[str, str]:
+    storage_options = {}
+
+    for k, v in iceberg_storage_properties.items():
+        if (
+            translated_key := ICEBERG_TO_OBJECT_STORE_CONFIG_KEY_MAP.get(k)
+        ) is not None:
+            storage_options[translated_key] = v
+        elif "." not in k:
+            # Pass-through non-Iceberg config keys, as they may be native config
+            # keys. We identify Iceberg keys by checking for a dot - from
+            # observation nearly all Iceberg config keys contain dots, whereas
+            # native config keys do not contain them.
+            storage_options[k] = v
+
+    # Otherwise, unknown keys are ignored / not passed. This is to avoid
+    # interfering with credential provider auto-init, which bails on
+    # unknown keys.
+
+    return storage_options
+
+
+# https://py.iceberg.apache.org/configuration/#fileio
+# This does not contain all keys - some have no object-store equivalent.
+ICEBERG_TO_OBJECT_STORE_CONFIG_KEY_MAP: dict[str, str] = {
+    # S3
+    "s3.endpoint": "aws_endpoint_url",
+    "s3.access-key-id": "aws_access_key_id",
+    "s3.secret-access-key": "aws_secret_access_key",
+    "s3.session-token": "aws_session_token",
+    "s3.region": "aws_region",
+    "s3.proxy-uri": "proxy_url",
+    "s3.connect-timeout": "connect_timeout",
+    "s3.request-timeout": "timeout",
+    "s3.force-virtual-addressing": "aws_virtual_hosted_style_request",
+    # Azure
+    "adls.account-name": "azure_storage_account_name",
+    "adls.account-key": "azure_storage_account_key",
+    "adls.sas-token": "azure_storage_sas_key",
+    "adls.tenant-id": "azure_storage_tenant_id",
+    "adls.client-id": "azure_storage_client_id",
+    "adls.client-secret": "azure_storage_client_secret",
+    "adls.account-host": "azure_storage_authority_host",
+    "adls.token": "azure_storage_token",
+    # Google storage
+    "gcs.oauth2.token": "bearer_token",
+    # HuggingFace
+    "hf.token": "token",
+}
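Editor's note: the short sketch below is not part of the wheel's contents. It only restates the key-translation rule implemented by `_convert_iceberg_to_object_store_storage_options()` and `ICEBERG_TO_OBJECT_STORE_CONFIG_KEY_MAP` above, using a trimmed copy of the map and hypothetical sample values, to show which keys get mapped, passed through, or dropped.

# Illustrative sketch only - mirrors the translation rule above with a trimmed map.
SAMPLE_KEY_MAP = {  # subset of ICEBERG_TO_OBJECT_STORE_CONFIG_KEY_MAP
    "s3.region": "aws_region",
    "s3.access-key-id": "aws_access_key_id",
}

def translate(props: dict[str, str]) -> dict[str, str]:
    out: dict[str, str] = {}
    for k, v in props.items():
        if k in SAMPLE_KEY_MAP:
            out[SAMPLE_KEY_MAP[k]] = v  # known Iceberg key -> object-store key
        elif "." not in k:
            out[k] = v  # dot-less keys are treated as native keys and passed through
        # dotted keys without a mapping are silently dropped
    return out

# Hypothetical input values, for demonstration only.
print(
    translate(
        {
            "s3.region": "eu-central-1",  # mapped -> aws_region
            "aws_endpoint_url": "http://localhost:9000",  # no dot -> passed through
            "s3.signer.uri": "https://signer.example",  # unmapped dotted key -> dropped
        }
    )
)
# {'aws_region': 'eu-central-1', 'aws_endpoint_url': 'http://localhost:9000'}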
polars/io/iceberg/functions.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Literal
+
+from polars._utils.unstable import issue_unstable_warning
+from polars._utils.wrap import wrap_ldf
+from polars.io.iceberg.dataset import IcebergDataset
+
+if TYPE_CHECKING:
+    from pyiceberg.table import Table
+
+    from polars.lazyframe.frame import LazyFrame
+
+
+def scan_iceberg(
+    source: str | Table,
+    *,
+    snapshot_id: int | None = None,
+    storage_options: dict[str, Any] | None = None,
+    reader_override: Literal["native", "pyiceberg"] | None = None,
+    use_metadata_statistics: bool = True,
+) -> LazyFrame:
+    """
+    Lazily read from an Apache Iceberg table.
+
+    Parameters
+    ----------
+    source
+        A PyIceberg table, or a direct path to the metadata.
+
+        Note: For Local filesystem, absolute and relative paths are supported but
+        for the supported object storages - GCS, Azure and S3 full URI must be provided.
+    snapshot_id
+        The snapshot ID to scan from.
+    storage_options
+        Extra options for the storage backends supported by `pyiceberg`.
+        For cloud storages, this may include configurations for authentication etc.
+
+        More info is available `here <https://py.iceberg.apache.org/configuration/>`__.
+    reader_override
+        Overrides the reader used to read the data.
+
+        .. warning::
+            This functionality is considered **unstable**. It may be changed
+            at any point without it being considered a breaking change.
+
+        Note that this parameter should not be necessary outside of testing, as
+        polars will by default automatically select the best reader.
+
+        Available options:
+
+        * native: Uses polars native reader. This allows for more optimizations to
+          improve performance.
+        * pyiceberg: Uses PyIceberg, which may support more features.
+    use_metadata_statistics
+        Load and use min/max statistics from Iceberg metadata files when a filter
+        is present. This allows the reader to potentially skip loading metadata
+        from the underlying data files.
+
+        .. warning::
+            This functionality is considered **unstable**. It may be changed
+            at any point without it being considered a breaking change.
+
+    Returns
+    -------
+    LazyFrame
+
+    Examples
+    --------
+    Creates a scan for an Iceberg table from local filesystem, or object store.
+
+    >>> table_path = "file:/path/to/iceberg-table/metadata.json"
+    >>> pl.scan_iceberg(table_path).collect()  # doctest: +SKIP
+
+    Creates a scan for an Iceberg table from S3.
+    See a list of supported storage options for S3 `here
+    <https://py.iceberg.apache.org/configuration/#fileio>`__.
+
+    >>> table_path = "s3://bucket/path/to/iceberg-table/metadata.json"
+    >>> storage_options = {
+    ...     "s3.region": "eu-central-1",
+    ...     "s3.access-key-id": "THE_AWS_ACCESS_KEY_ID",
+    ...     "s3.secret-access-key": "THE_AWS_SECRET_ACCESS_KEY",
+    ... }
+    >>> pl.scan_iceberg(
+    ...     table_path, storage_options=storage_options
+    ... ).collect()  # doctest: +SKIP
+
+    Creates a scan for an Iceberg table from Azure.
+    Supported options for Azure are available `here
+    <https://py.iceberg.apache.org/configuration/#azure-data-lake>`__.
+
+    Following type of table paths are supported:
+
+    * az://<container>/<path>/metadata.json
+    * adl://<container>/<path>/metadata.json
+    * abfs[s]://<container>/<path>/metadata.json
+
+    >>> table_path = "az://container/path/to/iceberg-table/metadata.json"
+    >>> storage_options = {
+    ...     "adlfs.account-name": "AZURE_STORAGE_ACCOUNT_NAME",
+    ...     "adlfs.account-key": "AZURE_STORAGE_ACCOUNT_KEY",
+    ... }
+    >>> pl.scan_iceberg(
+    ...     table_path, storage_options=storage_options
+    ... ).collect()  # doctest: +SKIP
+
+    Creates a scan for an Iceberg table from Google Cloud Storage.
+    Supported options for GCS are available `here
+    <https://py.iceberg.apache.org/configuration/#google-cloud-storage>`__.
+
+    >>> table_path = "s3://bucket/path/to/iceberg-table/metadata.json"
+    >>> storage_options = {
+    ...     "gcs.project-id": "my-gcp-project",
+    ...     "gcs.oauth.token": "ya29.dr.AfM...",
+    ... }
+    >>> pl.scan_iceberg(
+    ...     table_path, storage_options=storage_options
+    ... ).collect()  # doctest: +SKIP
+
+    Creates a scan for an Iceberg table with additional options.
+    In the below example, `without_files` option is used which loads the table without
+    file tracking information.
+
+    >>> table_path = "/path/to/iceberg-table/metadata.json"
+    >>> storage_options = {"py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    >>> pl.scan_iceberg(
+    ...     table_path, storage_options=storage_options
+    ... ).collect()  # doctest: +SKIP
+
+    Creates a scan for an Iceberg table using a specific snapshot ID.
+
+    >>> table_path = "/path/to/iceberg-table/metadata.json"
+    >>> snapshot_id = 7051579356916758811
+    >>> pl.scan_iceberg(table_path, snapshot_id=snapshot_id).collect()  # doctest: +SKIP
+    """
+    from polars._plr import PyLazyFrame
+
+    if reader_override is not None:
+        msg = "the `reader_override` parameter of `scan_iceberg()` is considered unstable."
+        issue_unstable_warning(msg)
+
+    dataset = IcebergDataset(
+        source,
+        snapshot_id=snapshot_id,
+        iceberg_storage_properties=storage_options,
+        reader_override=reader_override,
+        use_metadata_statistics=use_metadata_statistics,
+    )
+
+    return wrap_ldf(PyLazyFrame.new_from_dataset_object(dataset))
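Editor's note: the docstring above covers storage options and snapshot pinning; the short sketch below is not part of the diff and only illustrates the two parameters it documents but does not demonstrate, `reader_override` and `use_metadata_statistics`. The metadata path is hypothetical.

# Illustrative usage only (not part of the published wheel).
import polars as pl

lf = pl.scan_iceberg(
    "/path/to/iceberg-table/metadata.json",  # hypothetical local metadata path
    reader_override="pyiceberg",  # unstable parameter; emits an unstable-feature warning
    use_metadata_statistics=False,  # skip loading min/max statistics from Iceberg metadata
)
# lf.collect()  # would execute the scan against the table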