polars-runtime-compat 1.34.0b3-cp39-abi3-macosx_11_0_arm64.whl → 1.34.0b4-cp39-abi3-macosx_11_0_arm64.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published.
Note: this version of polars-runtime-compat has been flagged as a potentially problematic release; see the registry page for details.
- _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/METADATA +1 -1
- polars_runtime_compat-1.34.0b4.dist-info/RECORD +6 -0
- polars/__init__.py +0 -528
- polars/_cpu_check.py +0 -265
- polars/_dependencies.py +0 -355
- polars/_plr.py +0 -99
- polars/_plr.pyi +0 -2496
- polars/_reexport.py +0 -23
- polars/_typing.py +0 -478
- polars/_utils/__init__.py +0 -37
- polars/_utils/async_.py +0 -102
- polars/_utils/cache.py +0 -176
- polars/_utils/cloud.py +0 -40
- polars/_utils/constants.py +0 -29
- polars/_utils/construction/__init__.py +0 -46
- polars/_utils/construction/dataframe.py +0 -1397
- polars/_utils/construction/other.py +0 -72
- polars/_utils/construction/series.py +0 -560
- polars/_utils/construction/utils.py +0 -118
- polars/_utils/convert.py +0 -224
- polars/_utils/deprecation.py +0 -406
- polars/_utils/getitem.py +0 -457
- polars/_utils/logging.py +0 -11
- polars/_utils/nest_asyncio.py +0 -264
- polars/_utils/parquet.py +0 -15
- polars/_utils/parse/__init__.py +0 -12
- polars/_utils/parse/expr.py +0 -242
- polars/_utils/polars_version.py +0 -19
- polars/_utils/pycapsule.py +0 -53
- polars/_utils/scan.py +0 -27
- polars/_utils/serde.py +0 -63
- polars/_utils/slice.py +0 -215
- polars/_utils/udfs.py +0 -1251
- polars/_utils/unstable.py +0 -63
- polars/_utils/various.py +0 -782
- polars/_utils/wrap.py +0 -25
- polars/api.py +0 -370
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +0 -19
- polars/catalog/unity/client.py +0 -733
- polars/catalog/unity/models.py +0 -152
- polars/config.py +0 -1571
- polars/convert/__init__.py +0 -25
- polars/convert/general.py +0 -1046
- polars/convert/normalize.py +0 -261
- polars/dataframe/__init__.py +0 -5
- polars/dataframe/_html.py +0 -186
- polars/dataframe/frame.py +0 -12582
- polars/dataframe/group_by.py +0 -1067
- polars/dataframe/plotting.py +0 -257
- polars/datatype_expr/__init__.py +0 -5
- polars/datatype_expr/array.py +0 -56
- polars/datatype_expr/datatype_expr.py +0 -304
- polars/datatype_expr/list.py +0 -18
- polars/datatype_expr/struct.py +0 -69
- polars/datatypes/__init__.py +0 -122
- polars/datatypes/_parse.py +0 -195
- polars/datatypes/_utils.py +0 -48
- polars/datatypes/classes.py +0 -1213
- polars/datatypes/constants.py +0 -11
- polars/datatypes/constructor.py +0 -172
- polars/datatypes/convert.py +0 -366
- polars/datatypes/group.py +0 -130
- polars/exceptions.py +0 -230
- polars/expr/__init__.py +0 -7
- polars/expr/array.py +0 -964
- polars/expr/binary.py +0 -346
- polars/expr/categorical.py +0 -306
- polars/expr/datetime.py +0 -2620
- polars/expr/expr.py +0 -11272
- polars/expr/list.py +0 -1408
- polars/expr/meta.py +0 -444
- polars/expr/name.py +0 -321
- polars/expr/string.py +0 -3045
- polars/expr/struct.py +0 -357
- polars/expr/whenthen.py +0 -185
- polars/functions/__init__.py +0 -193
- polars/functions/aggregation/__init__.py +0 -33
- polars/functions/aggregation/horizontal.py +0 -298
- polars/functions/aggregation/vertical.py +0 -341
- polars/functions/as_datatype.py +0 -848
- polars/functions/business.py +0 -138
- polars/functions/col.py +0 -384
- polars/functions/datatype.py +0 -121
- polars/functions/eager.py +0 -524
- polars/functions/escape_regex.py +0 -29
- polars/functions/lazy.py +0 -2751
- polars/functions/len.py +0 -68
- polars/functions/lit.py +0 -210
- polars/functions/random.py +0 -22
- polars/functions/range/__init__.py +0 -19
- polars/functions/range/_utils.py +0 -15
- polars/functions/range/date_range.py +0 -303
- polars/functions/range/datetime_range.py +0 -370
- polars/functions/range/int_range.py +0 -348
- polars/functions/range/linear_space.py +0 -311
- polars/functions/range/time_range.py +0 -287
- polars/functions/repeat.py +0 -301
- polars/functions/whenthen.py +0 -353
- polars/interchange/__init__.py +0 -10
- polars/interchange/buffer.py +0 -77
- polars/interchange/column.py +0 -190
- polars/interchange/dataframe.py +0 -230
- polars/interchange/from_dataframe.py +0 -328
- polars/interchange/protocol.py +0 -303
- polars/interchange/utils.py +0 -170
- polars/io/__init__.py +0 -64
- polars/io/_utils.py +0 -317
- polars/io/avro.py +0 -49
- polars/io/clipboard.py +0 -36
- polars/io/cloud/__init__.py +0 -17
- polars/io/cloud/_utils.py +0 -80
- polars/io/cloud/credential_provider/__init__.py +0 -17
- polars/io/cloud/credential_provider/_builder.py +0 -520
- polars/io/cloud/credential_provider/_providers.py +0 -618
- polars/io/csv/__init__.py +0 -9
- polars/io/csv/_utils.py +0 -38
- polars/io/csv/batched_reader.py +0 -142
- polars/io/csv/functions.py +0 -1495
- polars/io/database/__init__.py +0 -6
- polars/io/database/_arrow_registry.py +0 -70
- polars/io/database/_cursor_proxies.py +0 -147
- polars/io/database/_executor.py +0 -578
- polars/io/database/_inference.py +0 -314
- polars/io/database/_utils.py +0 -144
- polars/io/database/functions.py +0 -516
- polars/io/delta.py +0 -499
- polars/io/iceberg/__init__.py +0 -3
- polars/io/iceberg/_utils.py +0 -697
- polars/io/iceberg/dataset.py +0 -556
- polars/io/iceberg/functions.py +0 -151
- polars/io/ipc/__init__.py +0 -8
- polars/io/ipc/functions.py +0 -514
- polars/io/json/__init__.py +0 -3
- polars/io/json/read.py +0 -101
- polars/io/ndjson.py +0 -332
- polars/io/parquet/__init__.py +0 -17
- polars/io/parquet/field_overwrites.py +0 -140
- polars/io/parquet/functions.py +0 -722
- polars/io/partition.py +0 -491
- polars/io/plugins.py +0 -187
- polars/io/pyarrow_dataset/__init__.py +0 -5
- polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
- polars/io/pyarrow_dataset/functions.py +0 -79
- polars/io/scan_options/__init__.py +0 -5
- polars/io/scan_options/_options.py +0 -59
- polars/io/scan_options/cast_options.py +0 -126
- polars/io/spreadsheet/__init__.py +0 -6
- polars/io/spreadsheet/_utils.py +0 -52
- polars/io/spreadsheet/_write_utils.py +0 -647
- polars/io/spreadsheet/functions.py +0 -1323
- polars/lazyframe/__init__.py +0 -9
- polars/lazyframe/engine_config.py +0 -61
- polars/lazyframe/frame.py +0 -8564
- polars/lazyframe/group_by.py +0 -669
- polars/lazyframe/in_process.py +0 -42
- polars/lazyframe/opt_flags.py +0 -333
- polars/meta/__init__.py +0 -14
- polars/meta/build.py +0 -33
- polars/meta/index_type.py +0 -27
- polars/meta/thread_pool.py +0 -50
- polars/meta/versions.py +0 -120
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +0 -213
- polars/ml/utilities.py +0 -30
- polars/plugins.py +0 -155
- polars/py.typed +0 -0
- polars/pyproject.toml +0 -103
- polars/schema.py +0 -265
- polars/selectors.py +0 -3117
- polars/series/__init__.py +0 -5
- polars/series/array.py +0 -776
- polars/series/binary.py +0 -254
- polars/series/categorical.py +0 -246
- polars/series/datetime.py +0 -2275
- polars/series/list.py +0 -1087
- polars/series/plotting.py +0 -191
- polars/series/series.py +0 -9197
- polars/series/string.py +0 -2367
- polars/series/struct.py +0 -154
- polars/series/utils.py +0 -191
- polars/sql/__init__.py +0 -7
- polars/sql/context.py +0 -677
- polars/sql/functions.py +0 -139
- polars/string_cache.py +0 -185
- polars/testing/__init__.py +0 -13
- polars/testing/asserts/__init__.py +0 -9
- polars/testing/asserts/frame.py +0 -231
- polars/testing/asserts/series.py +0 -219
- polars/testing/asserts/utils.py +0 -12
- polars/testing/parametric/__init__.py +0 -33
- polars/testing/parametric/profiles.py +0 -107
- polars/testing/parametric/strategies/__init__.py +0 -22
- polars/testing/parametric/strategies/_utils.py +0 -14
- polars/testing/parametric/strategies/core.py +0 -615
- polars/testing/parametric/strategies/data.py +0 -452
- polars/testing/parametric/strategies/dtype.py +0 -436
- polars/testing/parametric/strategies/legacy.py +0 -169
- polars/type_aliases.py +0 -24
- polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/WHEEL +0 -0
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/licenses/LICENSE +0 -0
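The listing above shows the entire pure-Python polars/ tree being dropped from the wheel between b3 and b4, leaving the compiled extension, metadata, and license files. The change can be confirmed locally by comparing the two archives' file listings; a minimal sketch, assuming both wheels have been downloaded to the working directory (the filenames below are illustrative):

import zipfile

# Wheels are plain zip archives, so namelist() gives the packaged paths.
old = set(
    zipfile.ZipFile(
        "polars_runtime_compat-1.34.0b3-cp39-abi3-macosx_11_0_arm64.whl"
    ).namelist()
)
new = set(
    zipfile.ZipFile(
        "polars_runtime_compat-1.34.0b4-cp39-abi3-macosx_11_0_arm64.whl"
    ).namelist()
)

print("removed:", sorted(old - new))  # expect the pure-Python polars/ sources
print("added:", sorted(new - old))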
polars/io/delta.py
DELETED
@@ -1,499 +0,0 @@
-from __future__ import annotations
-
-import warnings
-from datetime import datetime
-from pathlib import Path
-from typing import TYPE_CHECKING, Any
-
-from polars._dependencies import _DELTALAKE_AVAILABLE, deltalake
-from polars.datatypes import Null, Time
-from polars.datatypes.convert import unpack_dtypes
-from polars.io.cloud._utils import _get_path_scheme
-from polars.io.parquet import scan_parquet
-from polars.io.pyarrow_dataset.functions import scan_pyarrow_dataset
-from polars.io.scan_options.cast_options import ScanCastOptions
-from polars.schema import Schema
-
-if TYPE_CHECKING:
-    from typing import Literal
-
-    from deltalake import DeltaTable
-
-    from polars import DataFrame, DataType, LazyFrame
-    from polars.io.cloud import CredentialProviderFunction
-
-
-def read_delta(
-    source: str | Path | DeltaTable,
-    *,
-    version: int | str | datetime | None = None,
-    columns: list[str] | None = None,
-    rechunk: bool | None = None,
-    storage_options: dict[str, Any] | None = None,
-    credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
-    delta_table_options: dict[str, Any] | None = None,
-    use_pyarrow: bool = False,
-    pyarrow_options: dict[str, Any] | None = None,
-) -> DataFrame:
-    """
-    Reads into a DataFrame from a Delta lake table.
-
-    Parameters
-    ----------
-    source
-        DeltaTable or a Path or URI to the root of the Delta lake table.
-
-        Note: For Local filesystem, absolute and relative paths are supported but
-        for the supported object storages - GCS, Azure and S3 full URI must be provided.
-    version
-        Numerical version or timestamp version of the Delta lake table.
-
-        Note: If `version` is not provided, the latest version of delta lake
-        table is read.
-    columns
-        Columns to select. Accepts a list of column names.
-    rechunk
-        Make sure that all columns are contiguous in memory by
-        aggregating the chunks into a single array.
-    storage_options
-        Extra options for the storage backends supported by `deltalake`.
-        For cloud storages, this may include configurations for authentication etc.
-
-        More info is available `here
-        <https://delta-io.github.io/delta-rs/usage/loading-table/>`__.
-    credential_provider
-        Provide a function that can be called to provide cloud storage
-        credentials. The function is expected to return a dictionary of
-        credential keys along with an optional credential expiry time.
-
-        .. warning::
-            This functionality is considered **unstable**. It may be changed
-            at any point without it being considered a breaking change.
-    delta_table_options
-        Additional keyword arguments while reading a Delta lake Table.
-    use_pyarrow
-        Flag to enable pyarrow dataset reads.
-    pyarrow_options
-        Keyword arguments while converting a Delta lake Table to pyarrow table.
-
-    Returns
-    -------
-    DataFrame
-
-    Examples
-    --------
-    Reads a Delta table from local filesystem.
-    Note: Since version is not provided, the latest version of the delta table is read.
-
-    >>> table_path = "/path/to/delta-table/"
-    >>> pl.read_delta(table_path)  # doctest: +SKIP
-
-    Reads a specific version of the Delta table from local filesystem.
-    Note: This will fail if the provided version of the delta table does not exist.
-
-    >>> pl.read_delta(table_path, version=1)  # doctest: +SKIP
-
-    Time travel a delta table from local filesystem using a timestamp version.
-
-    >>> pl.read_delta(
-    ...     table_path, version=datetime(2020, 1, 1, tzinfo=timezone.utc)
-    ... )  # doctest: +SKIP
-
-    Reads a Delta table from AWS S3.
-    See a list of supported storage options for S3 `here
-    <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants>`__.
-
-    >>> table_path = "s3://bucket/path/to/delta-table/"
-    >>> storage_options = {
-    ...     "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID",
-    ...     "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY",
-    ... }
-    >>> pl.read_delta(table_path, storage_options=storage_options)  # doctest: +SKIP
-
-    Reads a Delta table from Google Cloud storage (GCS).
-    See a list of supported storage options for GCS `here
-    <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants>`__.
-
-    >>> table_path = "gs://bucket/path/to/delta-table/"
-    >>> storage_options = {"SERVICE_ACCOUNT": "SERVICE_ACCOUNT_JSON_ABSOLUTE_PATH"}
-    >>> pl.read_delta(table_path, storage_options=storage_options)  # doctest: +SKIP
-
-    Reads a Delta table from Azure.
-
-    Following type of table paths are supported,
-
-    * az://<container>/<path>
-    * adl://<container>/<path>
-    * abfs://<container>/<path>
-
-    See a list of supported storage options for Azure `here
-    <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants>`__.
-
-    >>> table_path = "az://container/path/to/delta-table/"
-    >>> storage_options = {
-    ...     "AZURE_STORAGE_ACCOUNT_NAME": "AZURE_STORAGE_ACCOUNT_NAME",
-    ...     "AZURE_STORAGE_ACCOUNT_KEY": "AZURE_STORAGE_ACCOUNT_KEY",
-    ... }
-    >>> pl.read_delta(table_path, storage_options=storage_options)  # doctest: +SKIP
-
-    Reads a Delta table with additional delta specific options. In the below example,
-    `without_files` option is used which loads the table without file tracking
-    information.
-
-    >>> table_path = "/path/to/delta-table/"
-    >>> delta_table_options = {"without_files": True}
-    >>> pl.read_delta(
-    ...     table_path, delta_table_options=delta_table_options
-    ... )  # doctest: +SKIP
-    """
-    df = scan_delta(
-        source=source,
-        version=version,
-        storage_options=storage_options,
-        credential_provider=credential_provider,
-        delta_table_options=delta_table_options,
-        use_pyarrow=use_pyarrow,
-        pyarrow_options=pyarrow_options,
-        rechunk=rechunk,
-    )
-
-    if columns is not None:
-        df = df.select(columns)
-    return df.collect()
-
-
-def scan_delta(
-    source: str | Path | DeltaTable,
-    *,
-    version: int | str | datetime | None = None,
-    storage_options: dict[str, Any] | None = None,
-    credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
-    delta_table_options: dict[str, Any] | None = None,
-    use_pyarrow: bool = False,
-    pyarrow_options: dict[str, Any] | None = None,
-    rechunk: bool | None = None,
-) -> LazyFrame:
-    """
-    Lazily read from a Delta lake table.
-
-    Parameters
-    ----------
-    source
-        DeltaTable or a Path or URI to the root of the Delta lake table.
-
-        Note: For Local filesystem, absolute and relative paths are supported but
-        for the supported object storages - GCS, Azure and S3 full URI must be provided.
-    version
-        Numerical version or timestamp version of the Delta lake table.
-
-        Note: If `version` is not provided, the latest version of delta lake
-        table is read.
-    storage_options
-        Extra options for the storage backends supported by `deltalake`.
-        For cloud storages, this may include configurations for authentication etc.
-
-        More info is available `here
-        <https://delta-io.github.io/delta-rs/usage/loading-table/>`__.
-    credential_provider
-        Provide a function that can be called to provide cloud storage
-        credentials. The function is expected to return a dictionary of
-        credential keys along with an optional credential expiry time.
-
-        .. warning::
-            This functionality is considered **unstable**. It may be changed
-            at any point without it being considered a breaking change.
-    delta_table_options
-        Additional keyword arguments while reading a Delta lake Table.
-    use_pyarrow
-        Flag to enable pyarrow dataset reads.
-    pyarrow_options
-        Keyword arguments while converting a Delta lake Table to pyarrow table.
-        Use this parameter when filtering on partitioned columns or to read
-        from a 'fsspec' supported filesystem.
-    rechunk
-        Make sure that all columns are contiguous in memory by
-        aggregating the chunks into a single array.
-
-    Returns
-    -------
-    LazyFrame
-
-    Examples
-    --------
-    Creates a scan for a Delta table from local filesystem.
-    Note: Since version is not provided, the latest version of the delta table is read.
-
-    >>> table_path = "/path/to/delta-table/"
-    >>> pl.scan_delta(table_path).collect()  # doctest: +SKIP
-
-    Creates a scan for a specific version of the Delta table from local filesystem.
-    Note: This will fail if the provided version of the delta table does not exist.
-
-    >>> pl.scan_delta(table_path, version=1).collect()  # doctest: +SKIP
-
-    Time travel a delta table from local filesystem using a timestamp version.
-
-    >>> pl.scan_delta(
-    ...     table_path, version=datetime(2020, 1, 1, tzinfo=timezone.utc)
-    ... ).collect()  # doctest: +SKIP
-
-    Creates a scan for a Delta table from AWS S3.
-    See a list of supported storage options for S3 `here
-    <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants>`__.
-
-    >>> table_path = "s3://bucket/path/to/delta-table/"
-    >>> storage_options = {
-    ...     "AWS_REGION": "eu-central-1",
-    ...     "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID",
-    ...     "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY",
-    ... }
-    >>> pl.scan_delta(
-    ...     table_path, storage_options=storage_options
-    ... ).collect()  # doctest: +SKIP
-
-    Creates a scan for a Delta table from Google Cloud storage (GCS).
-    See a list of supported storage options for GCS `here
-    <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants>`__.
-
-    >>> table_path = "gs://bucket/path/to/delta-table/"
-    >>> storage_options = {"SERVICE_ACCOUNT": "SERVICE_ACCOUNT_JSON_ABSOLUTE_PATH"}
-    >>> pl.scan_delta(
-    ...     table_path, storage_options=storage_options
-    ... ).collect()  # doctest: +SKIP
-
-    Creates a scan for a Delta table from Azure.
-    Supported options for Azure are available `here
-    <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants>`__.
-
-    Following type of table paths are supported,
-
-    * az://<container>/<path>
-    * adl://<container>/<path>
-    * abfs[s]://<container>/<path>
-
-    >>> table_path = "az://container/path/to/delta-table/"
-    >>> storage_options = {
-    ...     "AZURE_STORAGE_ACCOUNT_NAME": "AZURE_STORAGE_ACCOUNT_NAME",
-    ...     "AZURE_STORAGE_ACCOUNT_KEY": "AZURE_STORAGE_ACCOUNT_KEY",
-    ... }
-    >>> pl.scan_delta(
-    ...     table_path, storage_options=storage_options
-    ... ).collect()  # doctest: +SKIP
-
-    Creates a scan for a Delta table with additional delta specific options.
-    In the below example, `without_files` option is used which loads the table without
-    file tracking information.
-
-    >>> table_path = "/path/to/delta-table/"
-    >>> delta_table_options = {"without_files": True}
-    >>> pl.scan_delta(
-    ...     table_path, delta_table_options=delta_table_options
-    ... ).collect()  # doctest: +SKIP
-    """
-    _check_if_delta_available()
-
-    credential_provider_creds = {}
-
-    from deltalake import DeltaTable
-
-    from polars.io.cloud.credential_provider._builder import (
-        _init_credential_provider_builder,
-    )
-    from polars.io.cloud.credential_provider._providers import (
-        _get_credentials_from_provider_expiry_aware,
-    )
-
-    if not isinstance(source, DeltaTable):
-        credential_provider_builder = _init_credential_provider_builder(
-            credential_provider, source, storage_options, "scan_delta"
-        )
-    elif credential_provider is not None and credential_provider != "auto":
-        msg = "cannot use credential_provider when passing a DeltaTable object"
-        raise ValueError(msg)
-    else:
-        credential_provider_builder = None
-
-    del credential_provider
-
-    if credential_provider_builder and (
-        provider := credential_provider_builder.build_credential_provider()
-    ):
-        credential_provider_creds = (
-            _get_credentials_from_provider_expiry_aware(provider) or {}
-        )
-
-    dl_tbl = _get_delta_lake_table(
-        table_path=source,
-        version=version,
-        storage_options=(
-            {**(storage_options or {}), **credential_provider_creds}
-            if storage_options is not None or credential_provider_builder is not None
-            else None
-        ),
-        delta_table_options=delta_table_options,
-    )
-
-    if isinstance(source, DeltaTable) and (
-        source._storage_options is not None or storage_options is not None
-    ):
-        storage_options = {**(source._storage_options or {}), **(storage_options or {})}
-
-    if use_pyarrow:
-        pyarrow_options = pyarrow_options or {}
-        pa_ds = dl_tbl.to_pyarrow_dataset(**pyarrow_options)
-        return scan_pyarrow_dataset(pa_ds)
-
-    if pyarrow_options is not None:
-        msg = "To make use of pyarrow_options, set use_pyarrow to True"
-        raise ValueError(msg)
-
-    from deltalake.exceptions import DeltaProtocolError
-    from deltalake.table import (
-        MAX_SUPPORTED_READER_VERSION,
-        NOT_SUPPORTED_READER_VERSION,
-        SUPPORTED_READER_FEATURES,
-    )
-
-    table_protocol = dl_tbl.protocol()
-    if (
-        table_protocol.min_reader_version > MAX_SUPPORTED_READER_VERSION
-        or table_protocol.min_reader_version == NOT_SUPPORTED_READER_VERSION
-    ):
-        msg = (
-            f"The table's minimum reader version is {table_protocol.min_reader_version} "
-            f"but polars delta scanner only supports version 1 or {MAX_SUPPORTED_READER_VERSION} with these reader features: {SUPPORTED_READER_FEATURES}"
-        )
-        raise DeltaProtocolError(msg)
-    if (
-        table_protocol.min_reader_version >= 3
-        and table_protocol.reader_features is not None
-    ):
-        missing_features = {*table_protocol.reader_features}.difference(
-            SUPPORTED_READER_FEATURES
-        )
-        if len(missing_features) > 0:
-            msg = f"The table has set these reader features: {missing_features} but these are not yet supported by the polars delta scanner."
-            raise DeltaProtocolError(msg)
-
-    delta_schema = dl_tbl.schema()
-    polars_schema = Schema(delta_schema)
-    partition_columns = dl_tbl.metadata().partition_columns
-
-    def _split_schema(
-        schema: Schema, partition_columns: list[str]
-    ) -> tuple[Schema, Schema]:
-        if len(partition_columns) == 0:
-            return schema, Schema([])
-        main_schema = []
-        hive_schema = []
-
-        for name, dtype in schema.items():
-            if name in partition_columns:
-                hive_schema.append((name, dtype))
-            else:
-                main_schema.append((name, dtype))
-
-        return Schema(main_schema), Schema(hive_schema)
-
-    # Required because main_schema cannot contain hive columns currently
-    main_schema, hive_schema = _split_schema(polars_schema, partition_columns)
-
-    file_uris = dl_tbl.file_uris()
-
-    # LakeFS has an S3 compatible API, for reading therefore it's safe to do this.
-    # Deltalake internally has an integration for writing commits
-    if dl_tbl.table_uri.startswith("lakefs://"):
-        file_uris = [file_uri.replace("lakefs://", "s3://") for file_uri in file_uris]
-
-    return scan_parquet(
-        file_uris,
-        schema=main_schema,
-        hive_schema=hive_schema if len(partition_columns) > 0 else None,
-        cast_options=ScanCastOptions._default_iceberg(),
-        missing_columns="insert",
-        extra_columns="ignore",
-        hive_partitioning=len(partition_columns) > 0,
-        storage_options=storage_options,
-        credential_provider=credential_provider_builder,  # type: ignore[arg-type]
-        rechunk=rechunk or False,
-    )
-
-
-def _resolve_delta_lake_uri(table_uri: str | Path, *, strict: bool = True) -> str:
-    resolved_uri = str(
-        Path(table_uri).expanduser().resolve(strict)
-        if _get_path_scheme(table_uri) is None
-        else table_uri
-    )
-
-    return resolved_uri
-
-
-def _get_delta_lake_table(
-    table_path: str | Path | DeltaTable,
-    version: int | str | datetime | None = None,
-    storage_options: dict[str, Any] | None = None,
-    delta_table_options: dict[str, Any] | None = None,
-) -> deltalake.DeltaTable:
-    """
-    Initialize a Delta lake table for use in read and scan operations.
-
-    Notes
-    -----
-    Make sure to install deltalake>=0.8.0. Read the documentation
-    `here <https://delta-io.github.io/delta-rs/usage/installation/>`_.
-    """
-    _check_if_delta_available()
-
-    if isinstance(table_path, deltalake.DeltaTable):
-        if any(
-            [
-                version is not None,
-                storage_options is not None,
-                delta_table_options is not None,
-            ]
-        ):
-            warnings.warn(
-                """When supplying a DeltaTable directly, `version`, `storage_options`, and `delta_table_options` are ignored.
-                To silence this warning, don't supply those parameters.""",
-                RuntimeWarning,
-                stacklevel=1,
-            )
-        return table_path
-    if delta_table_options is None:
-        delta_table_options = {}
-    resolved_uri = _resolve_delta_lake_uri(table_path)
-    if not isinstance(version, (str, datetime)):
-        dl_tbl = deltalake.DeltaTable(
-            resolved_uri,
-            version=version,
-            storage_options=storage_options,
-            **delta_table_options,
-        )
-    else:
-        dl_tbl = deltalake.DeltaTable(
-            table_path,
-            storage_options=storage_options,
-            **delta_table_options,
-        )
-        dl_tbl.load_as_version(version)
-
-    return dl_tbl
-
-
-def _check_if_delta_available() -> None:
-    if not _DELTALAKE_AVAILABLE:
-        msg = "deltalake is not installed\n\nPlease run: pip install deltalake"
-        raise ModuleNotFoundError(msg)
-
-
-def _check_for_unsupported_types(dtypes: list[DataType]) -> None:
-    schema_dtypes = unpack_dtypes(*dtypes)
-    unsupported_types = {Time, Null}
-    # Note that this overlap check does NOT work correctly for Categorical, so
-    # if Categorical is added back to unsupported_types a different check will
-    # need to be used.
-
-    if overlap := schema_dtypes & unsupported_types:
-        msg = f"dataframe contains unsupported data types: {overlap!r}"
-        raise TypeError(msg)
polars/io/iceberg/__init__.py
DELETED