polars-runtime-compat 1.34.0b3-cp39-abi3-macosx_11_0_arm64.whl → 1.34.0b4-cp39-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of polars-runtime-compat might be problematic.

Files changed (203)
  1. _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
  2. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/METADATA +1 -1
  3. polars_runtime_compat-1.34.0b4.dist-info/RECORD +6 -0
  4. polars/__init__.py +0 -528
  5. polars/_cpu_check.py +0 -265
  6. polars/_dependencies.py +0 -355
  7. polars/_plr.py +0 -99
  8. polars/_plr.pyi +0 -2496
  9. polars/_reexport.py +0 -23
  10. polars/_typing.py +0 -478
  11. polars/_utils/__init__.py +0 -37
  12. polars/_utils/async_.py +0 -102
  13. polars/_utils/cache.py +0 -176
  14. polars/_utils/cloud.py +0 -40
  15. polars/_utils/constants.py +0 -29
  16. polars/_utils/construction/__init__.py +0 -46
  17. polars/_utils/construction/dataframe.py +0 -1397
  18. polars/_utils/construction/other.py +0 -72
  19. polars/_utils/construction/series.py +0 -560
  20. polars/_utils/construction/utils.py +0 -118
  21. polars/_utils/convert.py +0 -224
  22. polars/_utils/deprecation.py +0 -406
  23. polars/_utils/getitem.py +0 -457
  24. polars/_utils/logging.py +0 -11
  25. polars/_utils/nest_asyncio.py +0 -264
  26. polars/_utils/parquet.py +0 -15
  27. polars/_utils/parse/__init__.py +0 -12
  28. polars/_utils/parse/expr.py +0 -242
  29. polars/_utils/polars_version.py +0 -19
  30. polars/_utils/pycapsule.py +0 -53
  31. polars/_utils/scan.py +0 -27
  32. polars/_utils/serde.py +0 -63
  33. polars/_utils/slice.py +0 -215
  34. polars/_utils/udfs.py +0 -1251
  35. polars/_utils/unstable.py +0 -63
  36. polars/_utils/various.py +0 -782
  37. polars/_utils/wrap.py +0 -25
  38. polars/api.py +0 -370
  39. polars/catalog/__init__.py +0 -0
  40. polars/catalog/unity/__init__.py +0 -19
  41. polars/catalog/unity/client.py +0 -733
  42. polars/catalog/unity/models.py +0 -152
  43. polars/config.py +0 -1571
  44. polars/convert/__init__.py +0 -25
  45. polars/convert/general.py +0 -1046
  46. polars/convert/normalize.py +0 -261
  47. polars/dataframe/__init__.py +0 -5
  48. polars/dataframe/_html.py +0 -186
  49. polars/dataframe/frame.py +0 -12582
  50. polars/dataframe/group_by.py +0 -1067
  51. polars/dataframe/plotting.py +0 -257
  52. polars/datatype_expr/__init__.py +0 -5
  53. polars/datatype_expr/array.py +0 -56
  54. polars/datatype_expr/datatype_expr.py +0 -304
  55. polars/datatype_expr/list.py +0 -18
  56. polars/datatype_expr/struct.py +0 -69
  57. polars/datatypes/__init__.py +0 -122
  58. polars/datatypes/_parse.py +0 -195
  59. polars/datatypes/_utils.py +0 -48
  60. polars/datatypes/classes.py +0 -1213
  61. polars/datatypes/constants.py +0 -11
  62. polars/datatypes/constructor.py +0 -172
  63. polars/datatypes/convert.py +0 -366
  64. polars/datatypes/group.py +0 -130
  65. polars/exceptions.py +0 -230
  66. polars/expr/__init__.py +0 -7
  67. polars/expr/array.py +0 -964
  68. polars/expr/binary.py +0 -346
  69. polars/expr/categorical.py +0 -306
  70. polars/expr/datetime.py +0 -2620
  71. polars/expr/expr.py +0 -11272
  72. polars/expr/list.py +0 -1408
  73. polars/expr/meta.py +0 -444
  74. polars/expr/name.py +0 -321
  75. polars/expr/string.py +0 -3045
  76. polars/expr/struct.py +0 -357
  77. polars/expr/whenthen.py +0 -185
  78. polars/functions/__init__.py +0 -193
  79. polars/functions/aggregation/__init__.py +0 -33
  80. polars/functions/aggregation/horizontal.py +0 -298
  81. polars/functions/aggregation/vertical.py +0 -341
  82. polars/functions/as_datatype.py +0 -848
  83. polars/functions/business.py +0 -138
  84. polars/functions/col.py +0 -384
  85. polars/functions/datatype.py +0 -121
  86. polars/functions/eager.py +0 -524
  87. polars/functions/escape_regex.py +0 -29
  88. polars/functions/lazy.py +0 -2751
  89. polars/functions/len.py +0 -68
  90. polars/functions/lit.py +0 -210
  91. polars/functions/random.py +0 -22
  92. polars/functions/range/__init__.py +0 -19
  93. polars/functions/range/_utils.py +0 -15
  94. polars/functions/range/date_range.py +0 -303
  95. polars/functions/range/datetime_range.py +0 -370
  96. polars/functions/range/int_range.py +0 -348
  97. polars/functions/range/linear_space.py +0 -311
  98. polars/functions/range/time_range.py +0 -287
  99. polars/functions/repeat.py +0 -301
  100. polars/functions/whenthen.py +0 -353
  101. polars/interchange/__init__.py +0 -10
  102. polars/interchange/buffer.py +0 -77
  103. polars/interchange/column.py +0 -190
  104. polars/interchange/dataframe.py +0 -230
  105. polars/interchange/from_dataframe.py +0 -328
  106. polars/interchange/protocol.py +0 -303
  107. polars/interchange/utils.py +0 -170
  108. polars/io/__init__.py +0 -64
  109. polars/io/_utils.py +0 -317
  110. polars/io/avro.py +0 -49
  111. polars/io/clipboard.py +0 -36
  112. polars/io/cloud/__init__.py +0 -17
  113. polars/io/cloud/_utils.py +0 -80
  114. polars/io/cloud/credential_provider/__init__.py +0 -17
  115. polars/io/cloud/credential_provider/_builder.py +0 -520
  116. polars/io/cloud/credential_provider/_providers.py +0 -618
  117. polars/io/csv/__init__.py +0 -9
  118. polars/io/csv/_utils.py +0 -38
  119. polars/io/csv/batched_reader.py +0 -142
  120. polars/io/csv/functions.py +0 -1495
  121. polars/io/database/__init__.py +0 -6
  122. polars/io/database/_arrow_registry.py +0 -70
  123. polars/io/database/_cursor_proxies.py +0 -147
  124. polars/io/database/_executor.py +0 -578
  125. polars/io/database/_inference.py +0 -314
  126. polars/io/database/_utils.py +0 -144
  127. polars/io/database/functions.py +0 -516
  128. polars/io/delta.py +0 -499
  129. polars/io/iceberg/__init__.py +0 -3
  130. polars/io/iceberg/_utils.py +0 -697
  131. polars/io/iceberg/dataset.py +0 -556
  132. polars/io/iceberg/functions.py +0 -151
  133. polars/io/ipc/__init__.py +0 -8
  134. polars/io/ipc/functions.py +0 -514
  135. polars/io/json/__init__.py +0 -3
  136. polars/io/json/read.py +0 -101
  137. polars/io/ndjson.py +0 -332
  138. polars/io/parquet/__init__.py +0 -17
  139. polars/io/parquet/field_overwrites.py +0 -140
  140. polars/io/parquet/functions.py +0 -722
  141. polars/io/partition.py +0 -491
  142. polars/io/plugins.py +0 -187
  143. polars/io/pyarrow_dataset/__init__.py +0 -5
  144. polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
  145. polars/io/pyarrow_dataset/functions.py +0 -79
  146. polars/io/scan_options/__init__.py +0 -5
  147. polars/io/scan_options/_options.py +0 -59
  148. polars/io/scan_options/cast_options.py +0 -126
  149. polars/io/spreadsheet/__init__.py +0 -6
  150. polars/io/spreadsheet/_utils.py +0 -52
  151. polars/io/spreadsheet/_write_utils.py +0 -647
  152. polars/io/spreadsheet/functions.py +0 -1323
  153. polars/lazyframe/__init__.py +0 -9
  154. polars/lazyframe/engine_config.py +0 -61
  155. polars/lazyframe/frame.py +0 -8564
  156. polars/lazyframe/group_by.py +0 -669
  157. polars/lazyframe/in_process.py +0 -42
  158. polars/lazyframe/opt_flags.py +0 -333
  159. polars/meta/__init__.py +0 -14
  160. polars/meta/build.py +0 -33
  161. polars/meta/index_type.py +0 -27
  162. polars/meta/thread_pool.py +0 -50
  163. polars/meta/versions.py +0 -120
  164. polars/ml/__init__.py +0 -0
  165. polars/ml/torch.py +0 -213
  166. polars/ml/utilities.py +0 -30
  167. polars/plugins.py +0 -155
  168. polars/py.typed +0 -0
  169. polars/pyproject.toml +0 -103
  170. polars/schema.py +0 -265
  171. polars/selectors.py +0 -3117
  172. polars/series/__init__.py +0 -5
  173. polars/series/array.py +0 -776
  174. polars/series/binary.py +0 -254
  175. polars/series/categorical.py +0 -246
  176. polars/series/datetime.py +0 -2275
  177. polars/series/list.py +0 -1087
  178. polars/series/plotting.py +0 -191
  179. polars/series/series.py +0 -9197
  180. polars/series/string.py +0 -2367
  181. polars/series/struct.py +0 -154
  182. polars/series/utils.py +0 -191
  183. polars/sql/__init__.py +0 -7
  184. polars/sql/context.py +0 -677
  185. polars/sql/functions.py +0 -139
  186. polars/string_cache.py +0 -185
  187. polars/testing/__init__.py +0 -13
  188. polars/testing/asserts/__init__.py +0 -9
  189. polars/testing/asserts/frame.py +0 -231
  190. polars/testing/asserts/series.py +0 -219
  191. polars/testing/asserts/utils.py +0 -12
  192. polars/testing/parametric/__init__.py +0 -33
  193. polars/testing/parametric/profiles.py +0 -107
  194. polars/testing/parametric/strategies/__init__.py +0 -22
  195. polars/testing/parametric/strategies/_utils.py +0 -14
  196. polars/testing/parametric/strategies/core.py +0 -615
  197. polars/testing/parametric/strategies/data.py +0 -452
  198. polars/testing/parametric/strategies/dtype.py +0 -436
  199. polars/testing/parametric/strategies/legacy.py +0 -169
  200. polars/type_aliases.py +0 -24
  201. polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
  202. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/WHEEL +0 -0
  203. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/licenses/LICENSE +0 -0
polars/io/iceberg/dataset.py
@@ -1,556 +0,0 @@
- from __future__ import annotations
-
- import os
- from abc import ABC, abstractmethod
- from dataclasses import dataclass
- from functools import partial
- from time import perf_counter
- from typing import TYPE_CHECKING, Any, Literal
-
- import polars._reexport as pl
- from polars._utils.logging import eprint, verbose
- from polars.exceptions import ComputeError
- from polars.io.iceberg._utils import (
-     IcebergStatisticsLoader,
-     IdentityTransformedPartitionValuesBuilder,
-     _scan_pyarrow_dataset_impl,
- )
- from polars.io.scan_options.cast_options import ScanCastOptions
-
- if TYPE_CHECKING:
-     import pyarrow as pa
-     import pyiceberg.schema
-     from pyiceberg.table import Table
-
-     from polars.lazyframe.frame import LazyFrame
-
-
- class IcebergDataset:
-     """Dataset interface for PyIceberg."""
-
-     def __init__(
-         self,
-         source: str | Table,
-         *,
-         snapshot_id: int | None = None,
-         iceberg_storage_properties: dict[str, Any] | None = None,
-         reader_override: Literal["native", "pyiceberg"] | None = None,
-         use_metadata_statistics: bool = True,
-     ) -> None:
-         self._metadata_path = None
-         self._table = None
-         self._snapshot_id = snapshot_id
-         self._iceberg_storage_properties = iceberg_storage_properties
-         self._reader_override: Literal["native", "pyiceberg"] | None = reader_override
-         self._use_metadata_statistics = use_metadata_statistics
-
-         # Accept either a path or a table object. The one we don't have is
-         # lazily initialized when needed.
-
-         if isinstance(source, str):
-             self._metadata_path = source
-         else:
-             self._table = source
-
-     #
-     # PythonDatasetProvider interface functions
-     #
-
-     def schema(self) -> pa.schema:
-         """Fetch the schema of the table."""
-         return self.arrow_schema()
-
-     def arrow_schema(self) -> pa.schema:
-         """Fetch the arrow schema of the table."""
-         from pyiceberg.io.pyarrow import schema_to_pyarrow
-
-         return schema_to_pyarrow(self.table().schema())
-
-     def to_dataset_scan(
-         self,
-         *,
-         existing_resolved_version_key: str | None = None,
-         limit: int | None = None,
-         projection: list[str] | None = None,
-         filter_columns: list[str] | None = None,
-     ) -> tuple[LazyFrame, str] | None:
-         """Construct a LazyFrame scan."""
-         if (
-             scan_data := self._to_dataset_scan_impl(
-                 existing_resolved_version_key=existing_resolved_version_key,
-                 limit=limit,
-                 projection=projection,
-                 filter_columns=filter_columns,
-             )
-         ) is None:
-             return None
-
-         return scan_data.to_lazyframe(), scan_data.snapshot_id_key
-
-     def _to_dataset_scan_impl(
-         self,
-         *,
-         existing_resolved_version_key: str | None = None,
-         limit: int | None = None,
-         projection: list[str] | None = None,
-         filter_columns: list[str] | None = None,
-     ) -> _NativeIcebergScanData | _PyIcebergScanData | None:
-         from pyiceberg.io.pyarrow import schema_to_pyarrow
-
-         import polars._utils.logging
-
-         verbose = polars._utils.logging.verbose()
-
-         if verbose:
-             eprint(
-                 "IcebergDataset: to_dataset_scan(): "
-                 f"snapshot ID: {self._snapshot_id}, "
-                 f"limit: {limit}, "
-                 f"projection: {projection}, "
-                 f"filter_columns: {filter_columns}, "
-                 f"self._use_metadata_statistics: {self._use_metadata_statistics}"
-             )
-
-         tbl = self.table()
-
-         if verbose:
-             eprint(
-                 "IcebergDataset: to_dataset_scan(): "
-                 f"tbl.metadata.current_snapshot_id: {tbl.metadata.current_snapshot_id}"
-             )
-
-         snapshot_id = self._snapshot_id
-         schema_id = None
-
-         if snapshot_id is not None:
-             snapshot = tbl.snapshot_by_id(snapshot_id)
-
-             if snapshot is None:
-                 msg = f"iceberg snapshot ID not found: {snapshot_id}"
-                 raise ValueError(msg)
-
-             schema_id = snapshot.schema_id
-
-             if schema_id is None:
-                 msg = (
-                     f"IcebergDataset: requested snapshot {snapshot_id} "
-                     "did not contain a schema ID"
-                 )
-                 raise ValueError(msg)
-
-             iceberg_schema = tbl.schemas()[schema_id]
-             snapshot_id_key = f"{snapshot.snapshot_id}"
-         else:
-             iceberg_schema = tbl.schema()
-             schema_id = tbl.metadata.current_schema_id
-
-             snapshot_id_key = (
-                 f"{v.snapshot_id}" if (v := tbl.current_snapshot()) is not None else ""
-             )
-
-         if (
-             existing_resolved_version_key is not None
-             and existing_resolved_version_key == snapshot_id_key
-         ):
-             if verbose:
-                 eprint(
-                     "IcebergDataset: to_dataset_scan(): early return "
-                     f"({snapshot_id_key = })"
-                 )
-
-             return None
-
-         # Take from parameter first then envvar
-         reader_override = self._reader_override or os.getenv(
-             "POLARS_ICEBERG_READER_OVERRIDE"
-         )
-
-         if reader_override and reader_override not in ["native", "pyiceberg"]:
-             msg = (
-                 "iceberg: unknown value for reader_override: "
-                 f"'{reader_override}', expected one of ('native', 'pyiceberg')"
-             )
-             raise ValueError(msg)
-
-         fallback_reason = (
-             "forced reader_override='pyiceberg'"
-             if reader_override == "pyiceberg"
-             else f"unsupported table format version: {tbl.format_version}"
-             if not tbl.format_version <= 2
-             else None
-         )
-
-         selected_fields = ("*",) if projection is None else tuple(projection)
-
-         projected_iceberg_schema = (
-             iceberg_schema
-             if selected_fields == ("*",)
-             else iceberg_schema.select(*selected_fields)
-         )
-
-         sources = []
-         missing_field_defaults = IdentityTransformedPartitionValuesBuilder(
-             tbl,
-             projected_iceberg_schema,
-         )
-         statistics_loader: IcebergStatisticsLoader | None = (
-             IcebergStatisticsLoader(tbl, iceberg_schema.select(*filter_columns))
-             if self._use_metadata_statistics and filter_columns is not None
-             else None
-         )
-         deletion_files: dict[int, list[str]] = {}
-
-         if reader_override != "pyiceberg" and not fallback_reason:
-             from pyiceberg.manifest import DataFileContent, FileFormat
-
-             if verbose:
-                 eprint("IcebergDataset: to_dataset_scan(): begin path expansion")
-
-             start_time = perf_counter()
-
-             scan = tbl.scan(
-                 snapshot_id=snapshot_id,
-                 limit=limit,
-                 selected_fields=selected_fields,
-             )
-
-             total_deletion_files = 0
-
-             for i, file_info in enumerate(scan.plan_files()):
-                 if file_info.file.file_format != FileFormat.PARQUET:
-                     fallback_reason = (
-                         f"non-parquet format: {file_info.file.file_format}"
-                     )
-                     break
-
-                 if file_info.delete_files:
-                     deletion_files[i] = []
-
-                     for deletion_file in file_info.delete_files:
-                         if deletion_file.content != DataFileContent.POSITION_DELETES:
-                             fallback_reason = (
-                                 "unsupported deletion file type: "
-                                 f"{deletion_file.content}"
-                             )
-                             break
-
-                         if deletion_file.file_format != FileFormat.PARQUET:
-                             fallback_reason = (
-                                 "unsupported deletion file format: "
-                                 f"{deletion_file.file_format}"
-                             )
-                             break
-
-                         deletion_files[i].append(deletion_file.file_path)
-                         total_deletion_files += 1
-
-                 if fallback_reason:
-                     break
-
-                 missing_field_defaults.push_partition_values(
-                     current_index=i,
-                     partition_spec_id=file_info.file.spec_id,
-                     partition_values=file_info.file.partition,
-                 )
-
-                 if statistics_loader is not None:
-                     statistics_loader.push_file_statistics(file_info.file)
-
-                 sources.append(file_info.file.file_path)
-
-             if verbose:
-                 elapsed = perf_counter() - start_time
-                 eprint(
-                     "IcebergDataset: to_dataset_scan(): "
-                     f"finish path expansion ({elapsed:.3f}s)"
-                 )
-
-             if not fallback_reason:
-                 if verbose:
-                     s = "" if len(sources) == 1 else "s"
-                     s2 = "" if total_deletion_files == 1 else "s"
-
-                     eprint(
-                         "IcebergDataset: to_dataset_scan(): "
-                         f"native scan_parquet(): "
-                         f"{len(sources)} source{s}, "
-                         f"snapshot ID: {snapshot_id}, "
-                         f"schema ID: {schema_id}, "
-                         f"{total_deletion_files} deletion file{s2}"
-                     )
-
-                 # The arrow schema returned by `schema_to_pyarrow` will contain
-                 # 'PARQUET:field_id'
-                 column_mapping = schema_to_pyarrow(iceberg_schema)
-
-                 identity_transformed_values = missing_field_defaults.finish()
-
-                 min_max_statistics = (
-                     statistics_loader.finish(len(sources), identity_transformed_values)
-                     if statistics_loader is not None
-                     else None
-                 )
-
-                 storage_options = (
-                     _convert_iceberg_to_object_store_storage_options(
-                         self._iceberg_storage_properties
-                     )
-                     if self._iceberg_storage_properties is not None
-                     else None
-                 )
-
-                 return _NativeIcebergScanData(
-                     sources=sources,
-                     projected_iceberg_schema=projected_iceberg_schema,
-                     column_mapping=column_mapping,
-                     default_values=identity_transformed_values,
-                     deletion_files=deletion_files,
-                     min_max_statistics=min_max_statistics,
-                     statistics_loader=statistics_loader,
-                     storage_options=storage_options,
-                     _snapshot_id_key=snapshot_id_key,
-                 )
-
-             elif reader_override == "native":
-                 msg = f"iceberg reader_override='native' failed: {fallback_reason}"
-                 raise ComputeError(msg)
-
-         if verbose:
-             eprint(
-                 "IcebergDataset: to_dataset_scan(): "
-                 f"fallback to python[pyiceberg] scan: {fallback_reason}"
-             )
-
-         func = partial(
-             _scan_pyarrow_dataset_impl,
-             tbl,
-             snapshot_id=snapshot_id,
-             n_rows=limit,
-             with_columns=projection,
-         )
-
-         arrow_schema = schema_to_pyarrow(tbl.schema())
-
-         lf = pl.LazyFrame._scan_python_function(
-             arrow_schema,
-             func,
-             pyarrow=True,
-             is_pure=True,
-         )
-
-         return _PyIcebergScanData(lf=lf, _snapshot_id_key=snapshot_id_key)
-
-     #
-     # Accessors
-     #
-
-     def metadata_path(self) -> str:
-         """Fetch the metadata path."""
-         if self._metadata_path is None:
-             if self._table is None:
-                 msg = "impl error: both metadata_path and table are None"
-                 raise ValueError(msg)
-
-             self._metadata_path = self.table().metadata_location
-
-         return self._metadata_path
-
-     def table(self) -> Table:
-         """Fetch the PyIceberg Table object."""
-         if self._table is None:
-             if self._metadata_path is None:
-                 msg = "impl error: both metadata_path and table are None"
-                 raise ValueError(msg)
-
-             if verbose():
-                 eprint(f"IcebergDataset: construct table from {self._metadata_path = }")
-
-             from pyiceberg.table import StaticTable
-
-             self._table = StaticTable.from_metadata(
-                 metadata_location=self._metadata_path,
-                 properties=self._iceberg_storage_properties or {},
-             )
-
-         return self._table
-
-     #
-     # Serialization functions
-     #
-     # We don't serialize the iceberg table object - the remote machine should
-     # use their own permissions to reconstruct the table object from the path.
-     #
-
-     def __getstate__(self) -> dict[str, Any]:
-         state = {
-             "metadata_path": self.metadata_path(),
-             "snapshot_id": self._snapshot_id,
-             "iceberg_storage_properties": self._iceberg_storage_properties,
-             "reader_override": self._reader_override,
-         }
-
-         if verbose():
-             path_repr = state["metadata_path"]
-             snapshot_id = f"'{v}'" if (v := state["snapshot_id"]) is not None else None
-             keys_repr = _redact_dict_values(state["iceberg_storage_properties"])
-             reader_override = state["reader_override"]
-
-             eprint(
-                 "IcebergDataset: getstate(): "
-                 f"path: '{path_repr}', "
-                 f"snapshot_id: {snapshot_id}, "
-                 f"iceberg_storage_properties: {keys_repr}, "
-                 f"reader_override: {reader_override}"
-             )
-
-         return state
-
-     def __setstate__(self, state: dict[str, Any]) -> None:
-         if verbose():
-             path_repr = state["metadata_path"]
-             snapshot_id = state["snapshot_id"]
-             keys_repr = _redact_dict_values(state["iceberg_storage_properties"])
-             reader_override = state["reader_override"]
-
-             eprint(
-                 "IcebergDataset: getstate(): "
-                 f"path: '{path_repr}', "
-                 f"snapshot_id: '{snapshot_id}', "
-                 f"iceberg_storage_properties: {keys_repr}, "
-                 f"reader_override: {reader_override}"
-             )
-
-         IcebergDataset.__init__(
-             self,
-             state["metadata_path"],
-             snapshot_id=state["snapshot_id"],
-             iceberg_storage_properties=state["iceberg_storage_properties"],
-             reader_override=state["reader_override"],
-         )
-
-
- class _ResolvedScanDataBase(ABC):
-     @abstractmethod
-     def to_lazyframe(self) -> pl.LazyFrame: ...
-
-     @property
-     @abstractmethod
-     def snapshot_id_key(self) -> str: ...
-
-
- @dataclass
- class _NativeIcebergScanData(_ResolvedScanDataBase):
-     """Resolved parameters for a native Iceberg scan."""
-
-     sources: list[str]
-     projected_iceberg_schema: pyiceberg.schema.Schema
-     column_mapping: pa.Schema
-     default_values: dict[int, pl.Series | str]
-     deletion_files: dict[int, list[str]]
-     min_max_statistics: pl.DataFrame | None
-     # This is here for test purposes, as the `min_max_statistics` on this
-     # dataclass contain coalesced values from `default_values`, a test may
-     # access the statistics loader directly to inspect the values before
-     # coalescing.
-     statistics_loader: IcebergStatisticsLoader | None
-     storage_options: dict[str, str] | None
-     _snapshot_id_key: str
-
-     def to_lazyframe(self) -> pl.LazyFrame:
-         from polars.io.parquet.functions import scan_parquet
-
-         return scan_parquet(
-             self.sources,
-             cast_options=ScanCastOptions._default_iceberg(),
-             missing_columns="insert",
-             extra_columns="ignore",
-             storage_options=self.storage_options,
-             _column_mapping=("iceberg-column-mapping", self.column_mapping),
-             _default_values=("iceberg", self.default_values),
-             _deletion_files=("iceberg-position-delete", self.deletion_files),
-             _table_statistics=self.min_max_statistics,
-         )
-
-     @property
-     def snapshot_id_key(self) -> str:
-         return self._snapshot_id_key
-
-
- @dataclass
- class _PyIcebergScanData(_ResolvedScanDataBase):
-     """Resolved parameters for reading via PyIceberg."""
-
-     # We're not interested in inspecting anything for the pyiceberg scan, so
-     # this class is just a wrapper.
-     lf: pl.LazyFrame
-     _snapshot_id_key: str
-
-     def to_lazyframe(self) -> pl.LazyFrame:
-         return self.lf
-
-     @property
-     def snapshot_id_key(self) -> str:
-         return self._snapshot_id_key
-
-
- def _redact_dict_values(obj: Any) -> Any:
-     return (
-         {k: "REDACTED" for k in obj.keys()}  # noqa: SIM118
-         if isinstance(obj, dict)
-         else f"<{type(obj).__name__} object>"
-         if obj is not None
-         else "None"
-     )
-
-
- def _convert_iceberg_to_object_store_storage_options(
-     iceberg_storage_properties: dict[str, str],
- ) -> dict[str, str]:
-     storage_options = {}
-
-     for k, v in iceberg_storage_properties.items():
-         if (
-             translated_key := ICEBERG_TO_OBJECT_STORE_CONFIG_KEY_MAP.get(k)
-         ) is not None:
-             storage_options[translated_key] = v
-         elif "." not in k:
-             # Pass-through non-Iceberg config keys, as they may be native config
-             # keys. We identify Iceberg keys by checking for a dot - from
-             # observation nearly all Iceberg config keys contain dots, whereas
-             # native config keys do not contain them.
-             storage_options[k] = v
-
-     # Otherwise, unknown keys are ignored / not passed. This is to avoid
-     # interfering with credential provider auto-init, which bails on
-     # unknown keys.
-
-     return storage_options
-
-
- # https://py.iceberg.apache.org/configuration/#fileio
- # This does not contain all keys - some have no object-store equivalent.
- ICEBERG_TO_OBJECT_STORE_CONFIG_KEY_MAP: dict[str, str] = {
-     # S3
-     "s3.endpoint": "aws_endpoint_url",
-     "s3.access-key-id": "aws_access_key_id",
-     "s3.secret-access-key": "aws_secret_access_key",
-     "s3.session-token": "aws_session_token",
-     "s3.region": "aws_region",
-     "s3.proxy-uri": "proxy_url",
-     "s3.connect-timeout": "connect_timeout",
-     "s3.request-timeout": "timeout",
-     "s3.force-virtual-addressing": "aws_virtual_hosted_style_request",
-     # Azure
-     "adls.account-name": "azure_storage_account_name",
-     "adls.account-key": "azure_storage_account_key",
-     "adls.sas-token": "azure_storage_sas_key",
-     "adls.tenant-id": "azure_storage_tenant_id",
-     "adls.client-id": "azure_storage_client_id",
-     "adls.client-secret": "azure_storage_client_secret",
-     "adls.account-host": "azure_storage_authority_host",
-     "adls.token": "azure_storage_token",
-     # Google storage
-     "gcs.oauth2.token": "bearer_token",
-     # HuggingFace
-     "hf.token": "token",
- }
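
The config-key translation at the end of the removed dataset.py can be exercised in isolation. The following is a minimal, standalone sketch of that behaviour, not code from the package: the abbreviated mapping, the translate() helper name, and the demo values are illustrative.

# Standalone sketch of the translation performed by
# _convert_iceberg_to_object_store_storage_options above.
# Abbreviated mapping; names and values are illustrative only.
ICEBERG_TO_OBJECT_STORE = {
    "s3.region": "aws_region",
    "s3.access-key-id": "aws_access_key_id",
    "s3.secret-access-key": "aws_secret_access_key",
}

def translate(iceberg_props: dict[str, str]) -> dict[str, str]:
    out: dict[str, str] = {}
    for k, v in iceberg_props.items():
        if (mapped := ICEBERG_TO_OBJECT_STORE.get(k)) is not None:
            out[mapped] = v  # known Iceberg key, renamed to the object-store key
        elif "." not in k:
            out[k] = v  # dot-less keys pass through as possible native options
        # any other dotted key is silently dropped
    return out

print(translate({
    "s3.region": "eu-central-1",
    "aws_endpoint_url": "http://localhost:9000",
    "s3.unknown-option": "ignored",
}))
# {'aws_region': 'eu-central-1', 'aws_endpoint_url': 'http://localhost:9000'}

Dropping unrecognized dotted keys matches the comment in the removed code: it avoids interfering with credential-provider auto-initialization, which bails on unknown keys.
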
polars/io/iceberg/functions.py
@@ -1,151 +0,0 @@
- from __future__ import annotations
-
- from typing import TYPE_CHECKING, Any, Literal
-
- from polars._utils.unstable import issue_unstable_warning
- from polars._utils.wrap import wrap_ldf
- from polars.io.iceberg.dataset import IcebergDataset
-
- if TYPE_CHECKING:
-     from pyiceberg.table import Table
-
-     from polars.lazyframe.frame import LazyFrame
-
-
- def scan_iceberg(
-     source: str | Table,
-     *,
-     snapshot_id: int | None = None,
-     storage_options: dict[str, Any] | None = None,
-     reader_override: Literal["native", "pyiceberg"] | None = None,
-     use_metadata_statistics: bool = True,
- ) -> LazyFrame:
-     """
-     Lazily read from an Apache Iceberg table.
-
-     Parameters
-     ----------
-     source
-         A PyIceberg table, or a direct path to the metadata.
-
-         Note: For Local filesystem, absolute and relative paths are supported but
-         for the supported object storages - GCS, Azure and S3 full URI must be provided.
-     snapshot_id
-         The snapshot ID to scan from.
-     storage_options
-         Extra options for the storage backends supported by `pyiceberg`.
-         For cloud storages, this may include configurations for authentication etc.
-
-         More info is available `here <https://py.iceberg.apache.org/configuration/>`__.
-     reader_override
-         Overrides the reader used to read the data.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-
-         Note that this parameter should not be necessary outside of testing, as
-         polars will by default automatically select the best reader.
-
-         Available options:
-
-         * native: Uses polars native reader. This allows for more optimizations to
-           improve performance.
-         * pyiceberg: Uses PyIceberg, which may support more features.
-     use_metadata_statistics
-         Load and use min/max statistics from Iceberg metadata files when a filter
-         is present. This allows the reader to potentially skip loading metadata
-         from the underlying data files.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-
-     Returns
-     -------
-     LazyFrame
-
-     Examples
-     --------
-     Creates a scan for an Iceberg table from local filesystem, or object store.
-
-     >>> table_path = "file:/path/to/iceberg-table/metadata.json"
-     >>> pl.scan_iceberg(table_path).collect()  # doctest: +SKIP
-
-     Creates a scan for an Iceberg table from S3.
-     See a list of supported storage options for S3 `here
-     <https://py.iceberg.apache.org/configuration/#fileio>`__.
-
-     >>> table_path = "s3://bucket/path/to/iceberg-table/metadata.json"
-     >>> storage_options = {
-     ...     "s3.region": "eu-central-1",
-     ...     "s3.access-key-id": "THE_AWS_ACCESS_KEY_ID",
-     ...     "s3.secret-access-key": "THE_AWS_SECRET_ACCESS_KEY",
-     ... }
-     >>> pl.scan_iceberg(
-     ...     table_path, storage_options=storage_options
-     ... ).collect()  # doctest: +SKIP
-
-     Creates a scan for an Iceberg table from Azure.
-     Supported options for Azure are available `here
-     <https://py.iceberg.apache.org/configuration/#azure-data-lake>`__.
-
-     Following type of table paths are supported:
-
-     * az://<container>/<path>/metadata.json
-     * adl://<container>/<path>/metadata.json
-     * abfs[s]://<container>/<path>/metadata.json
-
-     >>> table_path = "az://container/path/to/iceberg-table/metadata.json"
-     >>> storage_options = {
-     ...     "adlfs.account-name": "AZURE_STORAGE_ACCOUNT_NAME",
-     ...     "adlfs.account-key": "AZURE_STORAGE_ACCOUNT_KEY",
-     ... }
-     >>> pl.scan_iceberg(
-     ...     table_path, storage_options=storage_options
-     ... ).collect()  # doctest: +SKIP
-
-     Creates a scan for an Iceberg table from Google Cloud Storage.
-     Supported options for GCS are available `here
-     <https://py.iceberg.apache.org/configuration/#google-cloud-storage>`__.
-
-     >>> table_path = "s3://bucket/path/to/iceberg-table/metadata.json"
-     >>> storage_options = {
-     ...     "gcs.project-id": "my-gcp-project",
-     ...     "gcs.oauth.token": "ya29.dr.AfM...",
-     ... }
-     >>> pl.scan_iceberg(
-     ...     table_path, storage_options=storage_options
-     ... ).collect()  # doctest: +SKIP
-
-     Creates a scan for an Iceberg table with additional options.
-     In the below example, `without_files` option is used which loads the table without
-     file tracking information.
-
-     >>> table_path = "/path/to/iceberg-table/metadata.json"
-     >>> storage_options = {"py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
-     >>> pl.scan_iceberg(
-     ...     table_path, storage_options=storage_options
-     ... ).collect()  # doctest: +SKIP
-
-     Creates a scan for an Iceberg table using a specific snapshot ID.
-
-     >>> table_path = "/path/to/iceberg-table/metadata.json"
-     >>> snapshot_id = 7051579356916758811
-     >>> pl.scan_iceberg(table_path, snapshot_id=snapshot_id).collect()  # doctest: +SKIP
-     """
-     from polars._plr import PyLazyFrame
-
-     if reader_override is not None:
-         msg = "the `reader_override` parameter of `scan_iceberg()` is considered unstable."
-         issue_unstable_warning(msg)
-
-     dataset = IcebergDataset(
-         source,
-         snapshot_id=snapshot_id,
-         iceberg_storage_properties=storage_options,
-         reader_override=reader_override,
-         use_metadata_statistics=use_metadata_statistics,
-     )
-
-     return wrap_ldf(PyLazyFrame.new_from_dataset_object(dataset))
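
For reference, a hedged usage sketch of the scan_iceberg() entry point removed above; the table path, snapshot ID, and option values are placeholders, and running it requires pyiceberg plus access to a real table.

# Illustrative only: placeholder path, snapshot ID, and storage options.
import polars as pl

lf = pl.scan_iceberg(
    "s3://bucket/path/to/iceberg-table/metadata.json",
    snapshot_id=7051579356916758811,
    reader_override="native",  # unstable parameter; emits an unstable-feature warning
    storage_options={"s3.region": "eu-central-1"},
)
print(lf.collect())

When reader_override is not passed, dataset.py falls back to the POLARS_ICEBERG_READER_OVERRIDE environment variable before letting polars pick the reader automatically.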