polars-runtime-compat 1.34.0b2 (polars_runtime_compat-1.34.0b2-cp39-abi3-macosx_11_0_arm64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of polars-runtime-compat might be problematic.

Files changed (203)
  1. _polars_runtime_compat/.gitkeep +0 -0
  2. _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
  3. polars/__init__.py +528 -0
  4. polars/_cpu_check.py +265 -0
  5. polars/_dependencies.py +355 -0
  6. polars/_plr.py +99 -0
  7. polars/_plr.pyi +2496 -0
  8. polars/_reexport.py +23 -0
  9. polars/_typing.py +478 -0
  10. polars/_utils/__init__.py +37 -0
  11. polars/_utils/async_.py +102 -0
  12. polars/_utils/cache.py +176 -0
  13. polars/_utils/cloud.py +40 -0
  14. polars/_utils/constants.py +29 -0
  15. polars/_utils/construction/__init__.py +46 -0
  16. polars/_utils/construction/dataframe.py +1397 -0
  17. polars/_utils/construction/other.py +72 -0
  18. polars/_utils/construction/series.py +560 -0
  19. polars/_utils/construction/utils.py +118 -0
  20. polars/_utils/convert.py +224 -0
  21. polars/_utils/deprecation.py +406 -0
  22. polars/_utils/getitem.py +457 -0
  23. polars/_utils/logging.py +11 -0
  24. polars/_utils/nest_asyncio.py +264 -0
  25. polars/_utils/parquet.py +15 -0
  26. polars/_utils/parse/__init__.py +12 -0
  27. polars/_utils/parse/expr.py +242 -0
  28. polars/_utils/polars_version.py +19 -0
  29. polars/_utils/pycapsule.py +53 -0
  30. polars/_utils/scan.py +27 -0
  31. polars/_utils/serde.py +63 -0
  32. polars/_utils/slice.py +215 -0
  33. polars/_utils/udfs.py +1251 -0
  34. polars/_utils/unstable.py +63 -0
  35. polars/_utils/various.py +782 -0
  36. polars/_utils/wrap.py +25 -0
  37. polars/api.py +370 -0
  38. polars/catalog/__init__.py +0 -0
  39. polars/catalog/unity/__init__.py +19 -0
  40. polars/catalog/unity/client.py +733 -0
  41. polars/catalog/unity/models.py +152 -0
  42. polars/config.py +1571 -0
  43. polars/convert/__init__.py +25 -0
  44. polars/convert/general.py +1046 -0
  45. polars/convert/normalize.py +261 -0
  46. polars/dataframe/__init__.py +5 -0
  47. polars/dataframe/_html.py +186 -0
  48. polars/dataframe/frame.py +12582 -0
  49. polars/dataframe/group_by.py +1067 -0
  50. polars/dataframe/plotting.py +257 -0
  51. polars/datatype_expr/__init__.py +5 -0
  52. polars/datatype_expr/array.py +56 -0
  53. polars/datatype_expr/datatype_expr.py +304 -0
  54. polars/datatype_expr/list.py +18 -0
  55. polars/datatype_expr/struct.py +69 -0
  56. polars/datatypes/__init__.py +122 -0
  57. polars/datatypes/_parse.py +195 -0
  58. polars/datatypes/_utils.py +48 -0
  59. polars/datatypes/classes.py +1213 -0
  60. polars/datatypes/constants.py +11 -0
  61. polars/datatypes/constructor.py +172 -0
  62. polars/datatypes/convert.py +366 -0
  63. polars/datatypes/group.py +130 -0
  64. polars/exceptions.py +230 -0
  65. polars/expr/__init__.py +7 -0
  66. polars/expr/array.py +964 -0
  67. polars/expr/binary.py +346 -0
  68. polars/expr/categorical.py +306 -0
  69. polars/expr/datetime.py +2620 -0
  70. polars/expr/expr.py +11272 -0
  71. polars/expr/list.py +1408 -0
  72. polars/expr/meta.py +444 -0
  73. polars/expr/name.py +321 -0
  74. polars/expr/string.py +3045 -0
  75. polars/expr/struct.py +357 -0
  76. polars/expr/whenthen.py +185 -0
  77. polars/functions/__init__.py +193 -0
  78. polars/functions/aggregation/__init__.py +33 -0
  79. polars/functions/aggregation/horizontal.py +298 -0
  80. polars/functions/aggregation/vertical.py +341 -0
  81. polars/functions/as_datatype.py +848 -0
  82. polars/functions/business.py +138 -0
  83. polars/functions/col.py +384 -0
  84. polars/functions/datatype.py +121 -0
  85. polars/functions/eager.py +524 -0
  86. polars/functions/escape_regex.py +29 -0
  87. polars/functions/lazy.py +2751 -0
  88. polars/functions/len.py +68 -0
  89. polars/functions/lit.py +210 -0
  90. polars/functions/random.py +22 -0
  91. polars/functions/range/__init__.py +19 -0
  92. polars/functions/range/_utils.py +15 -0
  93. polars/functions/range/date_range.py +303 -0
  94. polars/functions/range/datetime_range.py +370 -0
  95. polars/functions/range/int_range.py +348 -0
  96. polars/functions/range/linear_space.py +311 -0
  97. polars/functions/range/time_range.py +287 -0
  98. polars/functions/repeat.py +301 -0
  99. polars/functions/whenthen.py +353 -0
  100. polars/interchange/__init__.py +10 -0
  101. polars/interchange/buffer.py +77 -0
  102. polars/interchange/column.py +190 -0
  103. polars/interchange/dataframe.py +230 -0
  104. polars/interchange/from_dataframe.py +328 -0
  105. polars/interchange/protocol.py +303 -0
  106. polars/interchange/utils.py +170 -0
  107. polars/io/__init__.py +64 -0
  108. polars/io/_utils.py +317 -0
  109. polars/io/avro.py +49 -0
  110. polars/io/clipboard.py +36 -0
  111. polars/io/cloud/__init__.py +17 -0
  112. polars/io/cloud/_utils.py +80 -0
  113. polars/io/cloud/credential_provider/__init__.py +17 -0
  114. polars/io/cloud/credential_provider/_builder.py +520 -0
  115. polars/io/cloud/credential_provider/_providers.py +618 -0
  116. polars/io/csv/__init__.py +9 -0
  117. polars/io/csv/_utils.py +38 -0
  118. polars/io/csv/batched_reader.py +142 -0
  119. polars/io/csv/functions.py +1495 -0
  120. polars/io/database/__init__.py +6 -0
  121. polars/io/database/_arrow_registry.py +70 -0
  122. polars/io/database/_cursor_proxies.py +147 -0
  123. polars/io/database/_executor.py +578 -0
  124. polars/io/database/_inference.py +314 -0
  125. polars/io/database/_utils.py +144 -0
  126. polars/io/database/functions.py +516 -0
  127. polars/io/delta.py +499 -0
  128. polars/io/iceberg/__init__.py +3 -0
  129. polars/io/iceberg/_utils.py +697 -0
  130. polars/io/iceberg/dataset.py +556 -0
  131. polars/io/iceberg/functions.py +151 -0
  132. polars/io/ipc/__init__.py +8 -0
  133. polars/io/ipc/functions.py +514 -0
  134. polars/io/json/__init__.py +3 -0
  135. polars/io/json/read.py +101 -0
  136. polars/io/ndjson.py +332 -0
  137. polars/io/parquet/__init__.py +17 -0
  138. polars/io/parquet/field_overwrites.py +140 -0
  139. polars/io/parquet/functions.py +722 -0
  140. polars/io/partition.py +491 -0
  141. polars/io/plugins.py +187 -0
  142. polars/io/pyarrow_dataset/__init__.py +5 -0
  143. polars/io/pyarrow_dataset/anonymous_scan.py +109 -0
  144. polars/io/pyarrow_dataset/functions.py +79 -0
  145. polars/io/scan_options/__init__.py +5 -0
  146. polars/io/scan_options/_options.py +59 -0
  147. polars/io/scan_options/cast_options.py +126 -0
  148. polars/io/spreadsheet/__init__.py +6 -0
  149. polars/io/spreadsheet/_utils.py +52 -0
  150. polars/io/spreadsheet/_write_utils.py +647 -0
  151. polars/io/spreadsheet/functions.py +1323 -0
  152. polars/lazyframe/__init__.py +9 -0
  153. polars/lazyframe/engine_config.py +61 -0
  154. polars/lazyframe/frame.py +8564 -0
  155. polars/lazyframe/group_by.py +669 -0
  156. polars/lazyframe/in_process.py +42 -0
  157. polars/lazyframe/opt_flags.py +333 -0
  158. polars/meta/__init__.py +14 -0
  159. polars/meta/build.py +33 -0
  160. polars/meta/index_type.py +27 -0
  161. polars/meta/thread_pool.py +50 -0
  162. polars/meta/versions.py +120 -0
  163. polars/ml/__init__.py +0 -0
  164. polars/ml/torch.py +213 -0
  165. polars/ml/utilities.py +30 -0
  166. polars/plugins.py +155 -0
  167. polars/py.typed +0 -0
  168. polars/pyproject.toml +96 -0
  169. polars/schema.py +265 -0
  170. polars/selectors.py +3117 -0
  171. polars/series/__init__.py +5 -0
  172. polars/series/array.py +776 -0
  173. polars/series/binary.py +254 -0
  174. polars/series/categorical.py +246 -0
  175. polars/series/datetime.py +2275 -0
  176. polars/series/list.py +1087 -0
  177. polars/series/plotting.py +191 -0
  178. polars/series/series.py +9197 -0
  179. polars/series/string.py +2367 -0
  180. polars/series/struct.py +154 -0
  181. polars/series/utils.py +191 -0
  182. polars/sql/__init__.py +7 -0
  183. polars/sql/context.py +677 -0
  184. polars/sql/functions.py +139 -0
  185. polars/string_cache.py +185 -0
  186. polars/testing/__init__.py +13 -0
  187. polars/testing/asserts/__init__.py +9 -0
  188. polars/testing/asserts/frame.py +231 -0
  189. polars/testing/asserts/series.py +219 -0
  190. polars/testing/asserts/utils.py +12 -0
  191. polars/testing/parametric/__init__.py +33 -0
  192. polars/testing/parametric/profiles.py +107 -0
  193. polars/testing/parametric/strategies/__init__.py +22 -0
  194. polars/testing/parametric/strategies/_utils.py +14 -0
  195. polars/testing/parametric/strategies/core.py +615 -0
  196. polars/testing/parametric/strategies/data.py +452 -0
  197. polars/testing/parametric/strategies/dtype.py +436 -0
  198. polars/testing/parametric/strategies/legacy.py +169 -0
  199. polars/type_aliases.py +24 -0
  200. polars_runtime_compat-1.34.0b2.dist-info/METADATA +190 -0
  201. polars_runtime_compat-1.34.0b2.dist-info/RECORD +203 -0
  202. polars_runtime_compat-1.34.0b2.dist-info/WHEEL +4 -0
  203. polars_runtime_compat-1.34.0b2.dist-info/licenses/LICENSE +20 -0
polars/io/delta.py ADDED
@@ -0,0 +1,499 @@
+ from __future__ import annotations
+
+ import warnings
+ from datetime import datetime
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ from polars._dependencies import _DELTALAKE_AVAILABLE, deltalake
+ from polars.datatypes import Null, Time
+ from polars.datatypes.convert import unpack_dtypes
+ from polars.io.cloud._utils import _get_path_scheme
+ from polars.io.parquet import scan_parquet
+ from polars.io.pyarrow_dataset.functions import scan_pyarrow_dataset
+ from polars.io.scan_options.cast_options import ScanCastOptions
+ from polars.schema import Schema
+
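+ # `deltalake` is an optional dependency resolved through `polars._dependencies`;
+ # its availability is enforced at call time via `_check_if_delta_available()`
+ # rather than at import time.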
17
+ if TYPE_CHECKING:
18
+ from typing import Literal
19
+
20
+ from deltalake import DeltaTable
21
+
22
+ from polars import DataFrame, DataType, LazyFrame
23
+ from polars.io.cloud import CredentialProviderFunction
24
+
25
+
+ def read_delta(
+     source: str | Path | DeltaTable,
+     *,
+     version: int | str | datetime | None = None,
+     columns: list[str] | None = None,
+     rechunk: bool | None = None,
+     storage_options: dict[str, Any] | None = None,
+     credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
+     delta_table_options: dict[str, Any] | None = None,
+     use_pyarrow: bool = False,
+     pyarrow_options: dict[str, Any] | None = None,
+ ) -> DataFrame:
+     """
+     Reads into a DataFrame from a Delta lake table.
+
+     Parameters
+     ----------
+     source
+         DeltaTable or a Path or URI to the root of the Delta lake table.
+
+         Note: For the local filesystem, absolute and relative paths are
+         supported, but for the supported object stores (GCS, Azure, and S3)
+         a full URI must be provided.
+     version
+         Numerical version or timestamp version of the Delta lake table.
+
+         Note: If `version` is not provided, the latest version of the Delta
+         lake table is read.
+     columns
+         Columns to select. Accepts a list of column names.
+     rechunk
+         Make sure that all columns are contiguous in memory by
+         aggregating the chunks into a single array.
+     storage_options
+         Extra options for the storage backends supported by `deltalake`.
+         For cloud storages, this may include configurations for authentication etc.
+
+         More info is available `here
+         <https://delta-io.github.io/delta-rs/usage/loading-table/>`__.
+     credential_provider
+         Provide a function that can be called to provide cloud storage
+         credentials. The function is expected to return a dictionary of
+         credential keys along with an optional credential expiry time.
+
+         .. warning::
+             This functionality is considered **unstable**. It may be changed
+             at any point without it being considered a breaking change.
+     delta_table_options
+         Additional keyword arguments used while reading a Delta lake table.
+     use_pyarrow
+         Flag to enable pyarrow dataset reads.
+     pyarrow_options
+         Keyword arguments used while converting a Delta lake table to a
+         pyarrow table.
+
+     Returns
+     -------
+     DataFrame
+
+     Examples
+     --------
+     Reads a Delta table from the local filesystem.
+     Note: Since no version is provided, the latest version of the Delta table is read.
+
+     >>> table_path = "/path/to/delta-table/"
+     >>> pl.read_delta(table_path)  # doctest: +SKIP
+
+     Reads a specific version of the Delta table from the local filesystem.
+     Note: This will fail if the provided version of the Delta table does not exist.
+
+     >>> pl.read_delta(table_path, version=1)  # doctest: +SKIP
+
+     Time travel a Delta table from the local filesystem using a timestamp version.
+
+     >>> pl.read_delta(
+     ...     table_path, version=datetime(2020, 1, 1, tzinfo=timezone.utc)
+     ... )  # doctest: +SKIP
+
+     Reads a Delta table from AWS S3.
+     See a list of supported storage options for S3 `here
+     <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants>`__.
+
+     >>> table_path = "s3://bucket/path/to/delta-table/"
+     >>> storage_options = {
+     ...     "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID",
+     ...     "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY",
+     ... }
+     >>> pl.read_delta(table_path, storage_options=storage_options)  # doctest: +SKIP
+
+     Reads a Delta table from Google Cloud Storage (GCS).
+     See a list of supported storage options for GCS `here
+     <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants>`__.
+
+     >>> table_path = "gs://bucket/path/to/delta-table/"
+     >>> storage_options = {"SERVICE_ACCOUNT": "SERVICE_ACCOUNT_JSON_ABSOLUTE_PATH"}
+     >>> pl.read_delta(table_path, storage_options=storage_options)  # doctest: +SKIP
+
+     Reads a Delta table from Azure.
+
+     The following types of table paths are supported:
+
+     * az://<container>/<path>
+     * adl://<container>/<path>
+     * abfs://<container>/<path>
+
+     See a list of supported storage options for Azure `here
+     <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants>`__.
+
+     >>> table_path = "az://container/path/to/delta-table/"
+     >>> storage_options = {
+     ...     "AZURE_STORAGE_ACCOUNT_NAME": "AZURE_STORAGE_ACCOUNT_NAME",
+     ...     "AZURE_STORAGE_ACCOUNT_KEY": "AZURE_STORAGE_ACCOUNT_KEY",
+     ... }
+     >>> pl.read_delta(table_path, storage_options=storage_options)  # doctest: +SKIP
+
+     Reads a Delta table with additional Delta-specific options. In the example
+     below, the `without_files` option is used, which loads the table without
+     file tracking information.
+
+     >>> table_path = "/path/to/delta-table/"
+     >>> delta_table_options = {"without_files": True}
+     >>> pl.read_delta(
+     ...     table_path, delta_table_options=delta_table_options
+     ... )  # doctest: +SKIP
+     """
+     df = scan_delta(
+         source=source,
+         version=version,
+         storage_options=storage_options,
+         credential_provider=credential_provider,
+         delta_table_options=delta_table_options,
+         use_pyarrow=use_pyarrow,
+         pyarrow_options=pyarrow_options,
+         rechunk=rechunk,
+     )
+
+     if columns is not None:
+         df = df.select(columns)
+     return df.collect()
+
+
+ def scan_delta(
+     source: str | Path | DeltaTable,
+     *,
+     version: int | str | datetime | None = None,
+     storage_options: dict[str, Any] | None = None,
+     credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
+     delta_table_options: dict[str, Any] | None = None,
+     use_pyarrow: bool = False,
+     pyarrow_options: dict[str, Any] | None = None,
+     rechunk: bool | None = None,
+ ) -> LazyFrame:
+     """
+     Lazily read from a Delta lake table.
+
+     Parameters
+     ----------
+     source
+         DeltaTable or a Path or URI to the root of the Delta lake table.
+
+         Note: For the local filesystem, absolute and relative paths are
+         supported, but for the supported object stores (GCS, Azure, and S3)
+         a full URI must be provided.
+     version
+         Numerical version or timestamp version of the Delta lake table.
+
+         Note: If `version` is not provided, the latest version of the Delta
+         lake table is read.
+     storage_options
+         Extra options for the storage backends supported by `deltalake`.
+         For cloud storages, this may include configurations for authentication etc.
+
+         More info is available `here
+         <https://delta-io.github.io/delta-rs/usage/loading-table/>`__.
+     credential_provider
+         Provide a function that can be called to provide cloud storage
+         credentials. The function is expected to return a dictionary of
+         credential keys along with an optional credential expiry time.
+
+         .. warning::
+             This functionality is considered **unstable**. It may be changed
+             at any point without it being considered a breaking change.
+     delta_table_options
+         Additional keyword arguments used while reading a Delta lake table.
+     use_pyarrow
+         Flag to enable pyarrow dataset reads.
+     pyarrow_options
+         Keyword arguments used while converting a Delta lake table to a
+         pyarrow table. Use this parameter when filtering on partitioned
+         columns or to read from a 'fsspec'-supported filesystem.
+     rechunk
+         Make sure that all columns are contiguous in memory by
+         aggregating the chunks into a single array.
+
+     Returns
+     -------
+     LazyFrame
+
+     Examples
+     --------
+     Creates a scan for a Delta table from the local filesystem.
+     Note: Since no version is provided, the latest version of the Delta table is read.
+
+     >>> table_path = "/path/to/delta-table/"
+     >>> pl.scan_delta(table_path).collect()  # doctest: +SKIP
+
+     Creates a scan for a specific version of the Delta table from the local filesystem.
+     Note: This will fail if the provided version of the Delta table does not exist.
+
+     >>> pl.scan_delta(table_path, version=1).collect()  # doctest: +SKIP
+
+     Time travel a Delta table from the local filesystem using a timestamp version.
+
+     >>> pl.scan_delta(
+     ...     table_path, version=datetime(2020, 1, 1, tzinfo=timezone.utc)
+     ... ).collect()  # doctest: +SKIP
+
+     Creates a scan for a Delta table from AWS S3.
+     See a list of supported storage options for S3 `here
+     <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants>`__.
+
+     >>> table_path = "s3://bucket/path/to/delta-table/"
+     >>> storage_options = {
+     ...     "AWS_REGION": "eu-central-1",
+     ...     "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID",
+     ...     "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY",
+     ... }
+     >>> pl.scan_delta(
+     ...     table_path, storage_options=storage_options
+     ... ).collect()  # doctest: +SKIP
+
+     Creates a scan for a Delta table from Google Cloud Storage (GCS).
+     See a list of supported storage options for GCS `here
+     <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants>`__.
+
+     >>> table_path = "gs://bucket/path/to/delta-table/"
+     >>> storage_options = {"SERVICE_ACCOUNT": "SERVICE_ACCOUNT_JSON_ABSOLUTE_PATH"}
+     >>> pl.scan_delta(
+     ...     table_path, storage_options=storage_options
+     ... ).collect()  # doctest: +SKIP
+
+     Creates a scan for a Delta table from Azure.
+     Supported options for Azure are available `here
+     <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants>`__.
+
+     The following types of table paths are supported:
+
+     * az://<container>/<path>
+     * adl://<container>/<path>
+     * abfs[s]://<container>/<path>
+
+     >>> table_path = "az://container/path/to/delta-table/"
+     >>> storage_options = {
+     ...     "AZURE_STORAGE_ACCOUNT_NAME": "AZURE_STORAGE_ACCOUNT_NAME",
+     ...     "AZURE_STORAGE_ACCOUNT_KEY": "AZURE_STORAGE_ACCOUNT_KEY",
+     ... }
+     >>> pl.scan_delta(
+     ...     table_path, storage_options=storage_options
+     ... ).collect()  # doctest: +SKIP
+
+     Creates a scan for a Delta table with additional Delta-specific options.
+     In the example below, the `without_files` option is used, which loads the
+     table without file tracking information.
+
+     >>> table_path = "/path/to/delta-table/"
+     >>> delta_table_options = {"without_files": True}
+     >>> pl.scan_delta(
+     ...     table_path, delta_table_options=delta_table_options
+     ... ).collect()  # doctest: +SKIP
+     """
+     _check_if_delta_available()
+
+     credential_provider_creds = {}
+
+     from deltalake import DeltaTable
+
+     from polars.io.cloud.credential_provider._builder import (
+         _init_credential_provider_builder,
+     )
+     from polars.io.cloud.credential_provider._providers import (
+         _get_credentials_from_provider_expiry_aware,
+     )
+
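+     # A credential provider builder is only constructed when `source` is a
+     # path or URI; an existing DeltaTable carries its own storage
+     # configuration, so passing `credential_provider` alongside one is an error.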
+     if not isinstance(source, DeltaTable):
+         credential_provider_builder = _init_credential_provider_builder(
+             credential_provider, source, storage_options, "scan_delta"
+         )
+     elif credential_provider is not None and credential_provider != "auto":
+         msg = "cannot use credential_provider when passing a DeltaTable object"
+         raise ValueError(msg)
+     else:
+         credential_provider_builder = None
+
+     del credential_provider
+
+     if credential_provider_builder and (
+         provider := credential_provider_builder.build_credential_provider()
+     ):
+         credential_provider_creds = (
+             _get_credentials_from_provider_expiry_aware(provider) or {}
+         )
+
+     dl_tbl = _get_delta_lake_table(
+         table_path=source,
+         version=version,
+         storage_options=(
+             {**(storage_options or {}), **credential_provider_creds}
+             if storage_options is not None or credential_provider_builder is not None
+             else None
+         ),
+         delta_table_options=delta_table_options,
+     )
+
+     if isinstance(source, DeltaTable) and (
+         source._storage_options is not None or storage_options is not None
+     ):
+         storage_options = {**(source._storage_options or {}), **(storage_options or {})}
+
+     if use_pyarrow:
+         pyarrow_options = pyarrow_options or {}
+         pa_ds = dl_tbl.to_pyarrow_dataset(**pyarrow_options)
+         return scan_pyarrow_dataset(pa_ds)
+
+     if pyarrow_options is not None:
+         msg = "To make use of pyarrow_options, set use_pyarrow to True"
+         raise ValueError(msg)
+
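+     # Gate on the Delta reader protocol before scanning: tables requiring a
+     # newer `min_reader_version`, or reader features this scanner does not
+     # support, are rejected with a `DeltaProtocolError`.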
+     from deltalake.exceptions import DeltaProtocolError
+     from deltalake.table import (
+         MAX_SUPPORTED_READER_VERSION,
+         NOT_SUPPORTED_READER_VERSION,
+         SUPPORTED_READER_FEATURES,
+     )
+
+     table_protocol = dl_tbl.protocol()
+     if (
+         table_protocol.min_reader_version > MAX_SUPPORTED_READER_VERSION
+         or table_protocol.min_reader_version == NOT_SUPPORTED_READER_VERSION
+     ):
+         msg = (
+             f"The table's minimum reader version is {table_protocol.min_reader_version} "
+             f"but polars delta scanner only supports version 1 or {MAX_SUPPORTED_READER_VERSION} with these reader features: {SUPPORTED_READER_FEATURES}"
+         )
+         raise DeltaProtocolError(msg)
+     if (
+         table_protocol.min_reader_version >= 3
+         and table_protocol.reader_features is not None
+     ):
+         missing_features = {*table_protocol.reader_features}.difference(
+             SUPPORTED_READER_FEATURES
+         )
+         if len(missing_features) > 0:
+             msg = f"The table has set these reader features: {missing_features} but these are not yet supported by the polars delta scanner."
+             raise DeltaProtocolError(msg)
+
+     delta_schema = dl_tbl.schema()
+     polars_schema = Schema(delta_schema)
+     partition_columns = dl_tbl.metadata().partition_columns
+
+     def _split_schema(
+         schema: Schema, partition_columns: list[str]
+     ) -> tuple[Schema, Schema]:
+         if len(partition_columns) == 0:
+             return schema, Schema([])
+         main_schema = []
+         hive_schema = []
+
+         for name, dtype in schema.items():
+             if name in partition_columns:
+                 hive_schema.append((name, dtype))
+             else:
+                 main_schema.append((name, dtype))
+
+         return Schema(main_schema), Schema(hive_schema)
+
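+     # For a hypothetical table partitioned on "year", e.g. a schema of
+     # {"id": Int64, "year": Int32} with partition_columns=["year"], this
+     # yields main {"id": Int64} and hive {"year": Int32}.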
+     # Required because main_schema cannot contain hive columns currently
+     main_schema, hive_schema = _split_schema(polars_schema, partition_columns)
+
+     file_uris = dl_tbl.file_uris()
+
+     # LakeFS exposes an S3-compatible API, so rewriting the scheme is safe for
+     # reads; deltalake internally has an integration for writing commits.
+     if dl_tbl.table_uri.startswith("lakefs://"):
+         file_uris = [file_uri.replace("lakefs://", "s3://") for file_uri in file_uris]
+
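+     # The Delta log provides the concrete data-file list, so the actual read
+     # is delegated to polars' native parquet scanner; `missing_columns="insert"`
+     # and `extra_columns="ignore"` let files whose physical schema predates or
+     # postdates a schema change scan cleanly against the table schema.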
+     return scan_parquet(
+         file_uris,
+         schema=main_schema,
+         hive_schema=hive_schema if len(partition_columns) > 0 else None,
+         cast_options=ScanCastOptions._default_iceberg(),
+         missing_columns="insert",
+         extra_columns="ignore",
+         hive_partitioning=len(partition_columns) > 0,
+         storage_options=storage_options,
+         credential_provider=credential_provider_builder,  # type: ignore[arg-type]
+         rechunk=rechunk or False,
+     )
+
+
+ def _resolve_delta_lake_uri(table_uri: str | Path, *, strict: bool = True) -> str:
+     resolved_uri = str(
+         Path(table_uri).expanduser().resolve(strict)
+         if _get_path_scheme(table_uri) is None
+         else table_uri
+     )
+
+     return resolved_uri
+
+
+ def _get_delta_lake_table(
+     table_path: str | Path | DeltaTable,
+     version: int | str | datetime | None = None,
+     storage_options: dict[str, Any] | None = None,
+     delta_table_options: dict[str, Any] | None = None,
+ ) -> deltalake.DeltaTable:
+     """
+     Initialize a Delta lake table for use in read and scan operations.
+
+     Notes
+     -----
+     Make sure to install deltalake>=0.8.0. Read the documentation
+     `here <https://delta-io.github.io/delta-rs/usage/installation/>`_.
+     """
+     _check_if_delta_available()
+
+     if isinstance(table_path, deltalake.DeltaTable):
+         if any(
+             [
+                 version is not None,
+                 storage_options is not None,
+                 delta_table_options is not None,
+             ]
+         ):
+             warnings.warn(
+                 """When supplying a DeltaTable directly, `version`, `storage_options`, and `delta_table_options` are ignored.
+ To silence this warning, don't supply those parameters.""",
+                 RuntimeWarning,
+                 stacklevel=1,
+             )
+         return table_path
+     if delta_table_options is None:
+         delta_table_options = {}
+     resolved_uri = _resolve_delta_lake_uri(table_path)
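+     # Integer versions (and None) can be passed directly to the DeltaTable
+     # constructor; string/datetime versions require a separate
+     # `load_as_version()` call after construction.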
+     if not isinstance(version, (str, datetime)):
+         dl_tbl = deltalake.DeltaTable(
+             resolved_uri,
+             version=version,
+             storage_options=storage_options,
+             **delta_table_options,
+         )
+     else:
+         dl_tbl = deltalake.DeltaTable(
+             table_path,
+             storage_options=storage_options,
+             **delta_table_options,
+         )
+         dl_tbl.load_as_version(version)
+
+     return dl_tbl
+
+
+ def _check_if_delta_available() -> None:
+     if not _DELTALAKE_AVAILABLE:
+         msg = "deltalake is not installed\n\nPlease run: pip install deltalake"
+         raise ModuleNotFoundError(msg)
+
+
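+ # `Time` and `Null` have no Delta lake equivalents, hence the explicit
+ # rejection in the helper below.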
+ def _check_for_unsupported_types(dtypes: list[DataType]) -> None:
+     schema_dtypes = unpack_dtypes(*dtypes)
+     unsupported_types = {Time, Null}
+     # Note that this overlap check does NOT work correctly for Categorical, so
+     # if Categorical is added back to unsupported_types a different check will
+     # need to be used.
+
+     if overlap := schema_dtypes & unsupported_types:
+         msg = f"dataframe contains unsupported data types: {overlap!r}"
+         raise TypeError(msg)
polars/io/iceberg/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from polars.io.iceberg.functions import scan_iceberg
+
+ __all__ = ["scan_iceberg"]