polars-runtime-compat 1.34.0b3__cp39-abi3-win_amd64.whl → 1.34.0b5__cp39-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of polars-runtime-compat might be problematic.

Files changed (204)
  1. _polars_runtime_compat/_polars_runtime_compat.pyd +0 -0
  2. polars_runtime_compat-1.34.0b5.dist-info/METADATA +35 -0
  3. polars_runtime_compat-1.34.0b5.dist-info/RECORD +6 -0
  4. polars/__init__.py +0 -528
  5. polars/_cpu_check.py +0 -265
  6. polars/_dependencies.py +0 -355
  7. polars/_plr.py +0 -99
  8. polars/_plr.pyi +0 -2496
  9. polars/_reexport.py +0 -23
  10. polars/_typing.py +0 -478
  11. polars/_utils/__init__.py +0 -37
  12. polars/_utils/async_.py +0 -102
  13. polars/_utils/cache.py +0 -176
  14. polars/_utils/cloud.py +0 -40
  15. polars/_utils/constants.py +0 -29
  16. polars/_utils/construction/__init__.py +0 -46
  17. polars/_utils/construction/dataframe.py +0 -1397
  18. polars/_utils/construction/other.py +0 -72
  19. polars/_utils/construction/series.py +0 -560
  20. polars/_utils/construction/utils.py +0 -118
  21. polars/_utils/convert.py +0 -224
  22. polars/_utils/deprecation.py +0 -406
  23. polars/_utils/getitem.py +0 -457
  24. polars/_utils/logging.py +0 -11
  25. polars/_utils/nest_asyncio.py +0 -264
  26. polars/_utils/parquet.py +0 -15
  27. polars/_utils/parse/__init__.py +0 -12
  28. polars/_utils/parse/expr.py +0 -242
  29. polars/_utils/polars_version.py +0 -19
  30. polars/_utils/pycapsule.py +0 -53
  31. polars/_utils/scan.py +0 -27
  32. polars/_utils/serde.py +0 -63
  33. polars/_utils/slice.py +0 -215
  34. polars/_utils/udfs.py +0 -1251
  35. polars/_utils/unstable.py +0 -63
  36. polars/_utils/various.py +0 -782
  37. polars/_utils/wrap.py +0 -25
  38. polars/api.py +0 -370
  39. polars/catalog/__init__.py +0 -0
  40. polars/catalog/unity/__init__.py +0 -19
  41. polars/catalog/unity/client.py +0 -733
  42. polars/catalog/unity/models.py +0 -152
  43. polars/config.py +0 -1571
  44. polars/convert/__init__.py +0 -25
  45. polars/convert/general.py +0 -1046
  46. polars/convert/normalize.py +0 -261
  47. polars/dataframe/__init__.py +0 -5
  48. polars/dataframe/_html.py +0 -186
  49. polars/dataframe/frame.py +0 -12582
  50. polars/dataframe/group_by.py +0 -1067
  51. polars/dataframe/plotting.py +0 -257
  52. polars/datatype_expr/__init__.py +0 -5
  53. polars/datatype_expr/array.py +0 -56
  54. polars/datatype_expr/datatype_expr.py +0 -304
  55. polars/datatype_expr/list.py +0 -18
  56. polars/datatype_expr/struct.py +0 -69
  57. polars/datatypes/__init__.py +0 -122
  58. polars/datatypes/_parse.py +0 -195
  59. polars/datatypes/_utils.py +0 -48
  60. polars/datatypes/classes.py +0 -1213
  61. polars/datatypes/constants.py +0 -11
  62. polars/datatypes/constructor.py +0 -172
  63. polars/datatypes/convert.py +0 -366
  64. polars/datatypes/group.py +0 -130
  65. polars/exceptions.py +0 -230
  66. polars/expr/__init__.py +0 -7
  67. polars/expr/array.py +0 -964
  68. polars/expr/binary.py +0 -346
  69. polars/expr/categorical.py +0 -306
  70. polars/expr/datetime.py +0 -2620
  71. polars/expr/expr.py +0 -11272
  72. polars/expr/list.py +0 -1408
  73. polars/expr/meta.py +0 -444
  74. polars/expr/name.py +0 -321
  75. polars/expr/string.py +0 -3045
  76. polars/expr/struct.py +0 -357
  77. polars/expr/whenthen.py +0 -185
  78. polars/functions/__init__.py +0 -193
  79. polars/functions/aggregation/__init__.py +0 -33
  80. polars/functions/aggregation/horizontal.py +0 -298
  81. polars/functions/aggregation/vertical.py +0 -341
  82. polars/functions/as_datatype.py +0 -848
  83. polars/functions/business.py +0 -138
  84. polars/functions/col.py +0 -384
  85. polars/functions/datatype.py +0 -121
  86. polars/functions/eager.py +0 -524
  87. polars/functions/escape_regex.py +0 -29
  88. polars/functions/lazy.py +0 -2751
  89. polars/functions/len.py +0 -68
  90. polars/functions/lit.py +0 -210
  91. polars/functions/random.py +0 -22
  92. polars/functions/range/__init__.py +0 -19
  93. polars/functions/range/_utils.py +0 -15
  94. polars/functions/range/date_range.py +0 -303
  95. polars/functions/range/datetime_range.py +0 -370
  96. polars/functions/range/int_range.py +0 -348
  97. polars/functions/range/linear_space.py +0 -311
  98. polars/functions/range/time_range.py +0 -287
  99. polars/functions/repeat.py +0 -301
  100. polars/functions/whenthen.py +0 -353
  101. polars/interchange/__init__.py +0 -10
  102. polars/interchange/buffer.py +0 -77
  103. polars/interchange/column.py +0 -190
  104. polars/interchange/dataframe.py +0 -230
  105. polars/interchange/from_dataframe.py +0 -328
  106. polars/interchange/protocol.py +0 -303
  107. polars/interchange/utils.py +0 -170
  108. polars/io/__init__.py +0 -64
  109. polars/io/_utils.py +0 -317
  110. polars/io/avro.py +0 -49
  111. polars/io/clipboard.py +0 -36
  112. polars/io/cloud/__init__.py +0 -17
  113. polars/io/cloud/_utils.py +0 -80
  114. polars/io/cloud/credential_provider/__init__.py +0 -17
  115. polars/io/cloud/credential_provider/_builder.py +0 -520
  116. polars/io/cloud/credential_provider/_providers.py +0 -618
  117. polars/io/csv/__init__.py +0 -9
  118. polars/io/csv/_utils.py +0 -38
  119. polars/io/csv/batched_reader.py +0 -142
  120. polars/io/csv/functions.py +0 -1495
  121. polars/io/database/__init__.py +0 -6
  122. polars/io/database/_arrow_registry.py +0 -70
  123. polars/io/database/_cursor_proxies.py +0 -147
  124. polars/io/database/_executor.py +0 -578
  125. polars/io/database/_inference.py +0 -314
  126. polars/io/database/_utils.py +0 -144
  127. polars/io/database/functions.py +0 -516
  128. polars/io/delta.py +0 -499
  129. polars/io/iceberg/__init__.py +0 -3
  130. polars/io/iceberg/_utils.py +0 -697
  131. polars/io/iceberg/dataset.py +0 -556
  132. polars/io/iceberg/functions.py +0 -151
  133. polars/io/ipc/__init__.py +0 -8
  134. polars/io/ipc/functions.py +0 -514
  135. polars/io/json/__init__.py +0 -3
  136. polars/io/json/read.py +0 -101
  137. polars/io/ndjson.py +0 -332
  138. polars/io/parquet/__init__.py +0 -17
  139. polars/io/parquet/field_overwrites.py +0 -140
  140. polars/io/parquet/functions.py +0 -722
  141. polars/io/partition.py +0 -491
  142. polars/io/plugins.py +0 -187
  143. polars/io/pyarrow_dataset/__init__.py +0 -5
  144. polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
  145. polars/io/pyarrow_dataset/functions.py +0 -79
  146. polars/io/scan_options/__init__.py +0 -5
  147. polars/io/scan_options/_options.py +0 -59
  148. polars/io/scan_options/cast_options.py +0 -126
  149. polars/io/spreadsheet/__init__.py +0 -6
  150. polars/io/spreadsheet/_utils.py +0 -52
  151. polars/io/spreadsheet/_write_utils.py +0 -647
  152. polars/io/spreadsheet/functions.py +0 -1323
  153. polars/lazyframe/__init__.py +0 -9
  154. polars/lazyframe/engine_config.py +0 -61
  155. polars/lazyframe/frame.py +0 -8564
  156. polars/lazyframe/group_by.py +0 -669
  157. polars/lazyframe/in_process.py +0 -42
  158. polars/lazyframe/opt_flags.py +0 -333
  159. polars/meta/__init__.py +0 -14
  160. polars/meta/build.py +0 -33
  161. polars/meta/index_type.py +0 -27
  162. polars/meta/thread_pool.py +0 -50
  163. polars/meta/versions.py +0 -120
  164. polars/ml/__init__.py +0 -0
  165. polars/ml/torch.py +0 -213
  166. polars/ml/utilities.py +0 -30
  167. polars/plugins.py +0 -155
  168. polars/py.typed +0 -0
  169. polars/pyproject.toml +0 -103
  170. polars/schema.py +0 -265
  171. polars/selectors.py +0 -3117
  172. polars/series/__init__.py +0 -5
  173. polars/series/array.py +0 -776
  174. polars/series/binary.py +0 -254
  175. polars/series/categorical.py +0 -246
  176. polars/series/datetime.py +0 -2275
  177. polars/series/list.py +0 -1087
  178. polars/series/plotting.py +0 -191
  179. polars/series/series.py +0 -9197
  180. polars/series/string.py +0 -2367
  181. polars/series/struct.py +0 -154
  182. polars/series/utils.py +0 -191
  183. polars/sql/__init__.py +0 -7
  184. polars/sql/context.py +0 -677
  185. polars/sql/functions.py +0 -139
  186. polars/string_cache.py +0 -185
  187. polars/testing/__init__.py +0 -13
  188. polars/testing/asserts/__init__.py +0 -9
  189. polars/testing/asserts/frame.py +0 -231
  190. polars/testing/asserts/series.py +0 -219
  191. polars/testing/asserts/utils.py +0 -12
  192. polars/testing/parametric/__init__.py +0 -33
  193. polars/testing/parametric/profiles.py +0 -107
  194. polars/testing/parametric/strategies/__init__.py +0 -22
  195. polars/testing/parametric/strategies/_utils.py +0 -14
  196. polars/testing/parametric/strategies/core.py +0 -615
  197. polars/testing/parametric/strategies/data.py +0 -452
  198. polars/testing/parametric/strategies/dtype.py +0 -436
  199. polars/testing/parametric/strategies/legacy.py +0 -169
  200. polars/type_aliases.py +0 -24
  201. polars_runtime_compat-1.34.0b3.dist-info/METADATA +0 -190
  202. polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
  203. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/WHEEL +0 -0
  204. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/licenses/LICENSE +0 -0
polars/io/parquet/functions.py
@@ -1,722 +0,0 @@
- from __future__ import annotations
-
- import contextlib
- import io
- from collections.abc import Sequence
- from pathlib import Path
- from typing import IO, TYPE_CHECKING, Any
-
- import polars.functions as F
- from polars import concat as plconcat
- from polars._dependencies import import_optional
- from polars._utils.deprecation import (
-     deprecate_renamed_parameter,
-     issue_deprecation_warning,
- )
- from polars._utils.unstable import issue_unstable_warning
- from polars._utils.various import (
-     is_int_sequence,
-     is_path_or_str_sequence,
-     normalize_filepath,
- )
- from polars._utils.wrap import wrap_ldf
- from polars.convert import from_arrow
- from polars.io._utils import (
-     prepare_file_arg,
- )
- from polars.io.cloud.credential_provider._builder import (
-     _init_credential_provider_builder,
- )
- from polars.io.scan_options._options import ScanOptions
-
- with contextlib.suppress(ImportError):
-     from polars._plr import PyLazyFrame
-     from polars._plr import read_parquet_metadata as _read_parquet_metadata
-
- if TYPE_CHECKING:
-     from typing import Literal
-
-     from polars import DataFrame, DataType, LazyFrame
-     from polars._typing import (
-         ColumnMapping,
-         DefaultFieldValues,
-         DeletionFiles,
-         FileSource,
-         ParallelStrategy,
-         SchemaDict,
-     )
-     from polars.io.cloud import CredentialProviderFunction
-     from polars.io.scan_options import ScanCastOptions
-
-
- @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
- @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
- def read_parquet(
-     source: FileSource,
-     *,
-     columns: list[int] | list[str] | None = None,
-     n_rows: int | None = None,
-     row_index_name: str | None = None,
-     row_index_offset: int = 0,
-     parallel: ParallelStrategy = "auto",
-     use_statistics: bool = True,
-     hive_partitioning: bool | None = None,
-     glob: bool = True,
-     schema: SchemaDict | None = None,
-     hive_schema: SchemaDict | None = None,
-     try_parse_hive_dates: bool = True,
-     rechunk: bool = False,
-     low_memory: bool = False,
-     storage_options: dict[str, Any] | None = None,
-     credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
-     retries: int = 2,
-     use_pyarrow: bool = False,
-     pyarrow_options: dict[str, Any] | None = None,
-     memory_map: bool = True,
-     include_file_paths: str | None = None,
-     missing_columns: Literal["insert", "raise"] = "raise",
-     allow_missing_columns: bool | None = None,
- ) -> DataFrame:
-     """
-     Read into a DataFrame from a parquet file.
-
-     .. versionchanged:: 0.20.4
-         * The `row_count_name` parameter was renamed `row_index_name`.
-         * The `row_count_offset` parameter was renamed `row_index_offset`.
-
-     Parameters
-     ----------
-     source
-         Path(s) to a file or directory
-         When needing to authenticate for scanning cloud locations, see the
-         `storage_options` parameter.
-
-         File-like objects are supported (by "file-like object" we refer to objects
-         that have a `read()` method, such as a file handler like the builtin `open`
-         function, or a `BytesIO` instance). For file-like objects, the stream position
-         may not be updated accordingly after reading.
-     columns
-         Columns to select. Accepts a list of column indices (starting at zero) or a list
-         of column names.
-     n_rows
-         Stop reading from parquet file after reading `n_rows`.
-         Only valid when `use_pyarrow=False`.
-     row_index_name
-         Insert a row index column with the given name into the DataFrame as the first
-         column. If set to `None` (default), no row index column is created.
-     row_index_offset
-         Start the row index at this offset. Cannot be negative.
-         Only used if `row_index_name` is set.
-     parallel : {'auto', 'columns', 'row_groups', 'none'}
-         This determines the direction of parallelism. 'auto' will try to determine the
-         optimal direction.
-     use_statistics
-         Use statistics in the parquet to determine if pages
-         can be skipped from reading.
-     hive_partitioning
-         Infer statistics and schema from Hive partitioned URL and use them
-         to prune reads. This is unset by default (i.e. `None`), meaning it is
-         automatically enabled when a single directory is passed, and otherwise
-         disabled.
-     glob
-         Expand path given via globbing rules.
-     schema
-         Specify the datatypes of the columns. The datatypes must match the
-         datatypes in the file(s). If there are extra columns that are not in the
-         file(s), consider also passing `missing_columns='insert'`.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-     hive_schema
-         The column names and data types of the columns by which the data is partitioned.
-         If set to `None` (default), the schema of the Hive partitions is inferred.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-     try_parse_hive_dates
-         Whether to try parsing hive values as date/datetime types.
-     rechunk
-         Make sure that all columns are contiguous in memory by
-         aggregating the chunks into a single array.
-     low_memory
-         Reduce memory pressure at the expense of performance.
-     storage_options
-         Options that indicate how to connect to a cloud provider.
-
-         The cloud providers currently supported are AWS, GCP, and Azure.
-         See supported keys here:
-
-         * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
-         * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
-         * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
-         * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
-           `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
-
-         If `storage_options` is not provided, Polars will try to infer the information
-         from environment variables.
-     credential_provider
-         Provide a function that can be called to provide cloud storage
-         credentials. The function is expected to return a dictionary of
-         credential keys along with an optional credential expiry time.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-     retries
-         Number of retries if accessing a cloud instance fails.
-     use_pyarrow
-         Use PyArrow instead of the Rust-native Parquet reader. The PyArrow reader is
-         more stable.
-     pyarrow_options
-         Keyword arguments for `pyarrow.parquet.read_table
-         <https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html>`_.
-     memory_map
-         Memory map underlying file. This will likely increase performance.
-         Only used when `use_pyarrow=True`.
-     include_file_paths
-         Include the path of the source file(s) as a column with this name.
-         Only valid when `use_pyarrow=False`.
-     missing_columns
-         Configuration for behavior when columns defined in the schema
-         are missing from the data:
-
-         * `insert`: Inserts the missing columns using NULLs as the row values.
-         * `raise`: Raises an error.
-
-     allow_missing_columns
-         When reading a list of parquet files, if a column existing in the first
-         file cannot be found in subsequent files, the default behavior is to
-         raise an error. However, if `allow_missing_columns` is set to
-         `True`, a full-NULL column is returned instead of erroring for the files
-         that do not contain the column.
-
-         .. deprecated:: 1.30.0
-             Use the parameter `missing_columns` instead and pass one of
-             `('insert', 'raise')`.
-
-     Returns
-     -------
-     DataFrame
-
-     See Also
-     --------
-     scan_parquet: Lazily read from a parquet file or multiple files via glob patterns.
-     scan_pyarrow_dataset
-
-     Warnings
-     --------
-     Calling `read_parquet().lazy()` is an antipattern as this forces Polars to
-     materialize a full parquet file and therefore cannot push any optimizations
-     into the reader. Therefore always prefer `scan_parquet` if you want to work
-     with `LazyFrame` s.
-
-     """
-     if schema is not None:
-         msg = "the `schema` parameter of `read_parquet` is considered unstable."
-         issue_unstable_warning(msg)
-
-     if hive_schema is not None:
-         msg = "the `hive_schema` parameter of `read_parquet` is considered unstable."
-         issue_unstable_warning(msg)
-
-     # Dispatch to pyarrow if requested
-     if use_pyarrow:
-         if n_rows is not None:
-             msg = "`n_rows` cannot be used with `use_pyarrow=True`"
-             raise ValueError(msg)
-         if include_file_paths is not None:
-             msg = "`include_file_paths` cannot be used with `use_pyarrow=True`"
-             raise ValueError(msg)
-         if schema is not None:
-             msg = "`schema` cannot be used with `use_pyarrow=True`"
-             raise ValueError(msg)
-         if hive_schema is not None:
-             msg = (
-                 "cannot use `hive_partitions` with `use_pyarrow=True`"
-                 "\n\nHint: Pass `pyarrow_options` instead with a 'partitioning' entry."
-             )
-             raise TypeError(msg)
-         return _read_parquet_with_pyarrow(
-             source,
-             columns=columns,
-             storage_options=storage_options,
-             pyarrow_options=pyarrow_options,
-             memory_map=memory_map,
-             rechunk=rechunk,
-         )
-
-     if allow_missing_columns is not None:
-         issue_deprecation_warning(
-             "the parameter `allow_missing_columns` for `read_parquet` is deprecated. "
-             "Use the parameter `missing_columns` instead and pass one of "
-             "`('insert', 'raise')`.",
-             version="1.30.0",
-         )
-
-         missing_columns = "insert" if allow_missing_columns else "raise"
-
-     # For other inputs, defer to `scan_parquet`
-     lf = scan_parquet(
-         source,
-         n_rows=n_rows,
-         row_index_name=row_index_name,
-         row_index_offset=row_index_offset,
-         parallel=parallel,
-         use_statistics=use_statistics,
-         hive_partitioning=hive_partitioning,
-         schema=schema,
-         hive_schema=hive_schema,
-         try_parse_hive_dates=try_parse_hive_dates,
-         rechunk=rechunk,
-         low_memory=low_memory,
-         cache=False,
-         storage_options=storage_options,
-         credential_provider=credential_provider,
-         retries=retries,
-         glob=glob,
-         include_file_paths=include_file_paths,
-         missing_columns=missing_columns,
-     )
-
-     if columns is not None:
-         if is_int_sequence(columns):
-             lf = lf.select(F.nth(columns))
-         else:
-             lf = lf.select(columns)
-
-     return lf.collect()
-
-
- def _read_parquet_with_pyarrow(
-     source: str
-     | Path
-     | IO[bytes]
-     | bytes
-     | list[str]
-     | list[Path]
-     | list[IO[bytes]]
-     | list[bytes],
-     *,
-     columns: list[int] | list[str] | None = None,
-     storage_options: dict[str, Any] | None = None,
-     pyarrow_options: dict[str, Any] | None = None,
-     memory_map: bool = True,
-     rechunk: bool = True,
- ) -> DataFrame:
-     pyarrow_parquet = import_optional(
-         "pyarrow.parquet",
-         err_prefix="",
-         err_suffix="is required when using `read_parquet(..., use_pyarrow=True)`",
-     )
-     pyarrow_options = pyarrow_options or {}
-
-     sources: list[str | Path | IO[bytes] | bytes | list[str] | list[Path]] = []
-     if isinstance(source, list):
-         if len(source) > 0 and isinstance(source[0], (bytes, io.IOBase)):
-             sources = source  # type: ignore[assignment]
-         else:
-             sources = [source]  # type: ignore[list-item]
-     else:
-         sources = [source]
-
-     results: list[DataFrame] = []
-     for source in sources:
-         with prepare_file_arg(
-             source,  # type: ignore[arg-type]
-             use_pyarrow=True,
-             storage_options=storage_options,
-         ) as source_prep:
-             pa_table = pyarrow_parquet.read_table(
-                 source_prep,
-                 memory_map=memory_map,
-                 columns=columns,
-                 **pyarrow_options,
-             )
-             result = from_arrow(pa_table, rechunk=rechunk)
-             results.append(result)  # type: ignore[arg-type]
-
-     if len(results) == 1:
-         return results[0]
-     else:
-         return plconcat(results)
-
-
- def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataType]:
-     """
-     Get the schema of a Parquet file without reading data.
-
-     If you would like to read the schema of a cloud file with authentication
-     configuration, it is recommended use `scan_parquet` - e.g.
-     `scan_parquet(..., storage_options=...).collect_schema()`.
-
-     Parameters
-     ----------
-     source
-         Path to a file or a file-like object (by "file-like object" we refer to objects
-         that have a `read()` method, such as a file handler like the builtin `open`
-         function, or a `BytesIO` instance). For file-like objects, the stream position
-         may not be updated accordingly after reading.
-
-     Returns
-     -------
-     dict
-         Dictionary mapping column names to datatypes
-
-     See Also
-     --------
-     scan_parquet
-     """
-     return scan_parquet(source).collect_schema()
-
-
- def read_parquet_metadata(
-     source: str | Path | IO[bytes] | bytes,
-     storage_options: dict[str, Any] | None = None,
-     credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
-     retries: int = 2,
- ) -> dict[str, str]:
-     """
-     Get file-level custom metadata of a Parquet file without reading data.
-
-     .. warning::
-         This functionality is considered **experimental**. It may be removed or
-         changed at any point without it being considered a breaking change.
-
-     Parameters
-     ----------
-     source
-         Path to a file or a file-like object (by "file-like object" we refer to objects
-         that have a `read()` method, such as a file handler like the builtin `open`
-         function, or a `BytesIO` instance). For file-like objects, the stream position
-         may not be updated accordingly after reading.
-     storage_options
-         Options that indicate how to connect to a cloud provider.
-
-         The cloud providers currently supported are AWS, GCP, and Azure.
-         See supported keys here:
-
-         * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
-         * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
-         * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
-         * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
-           `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
-
-         If `storage_options` is not provided, Polars will try to infer the information
-         from environment variables.
-     credential_provider
-         Provide a function that can be called to provide cloud storage
-         credentials. The function is expected to return a dictionary of
-         credential keys along with an optional credential expiry time.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-     retries
-         Number of retries if accessing a cloud instance fails.
-
-     Returns
-     -------
-     dict
-         Dictionary with the metadata. Empty if no custom metadata is available.
-     """
-     if isinstance(source, (str, Path)):
-         source = normalize_filepath(source, check_not_directory=False)
-
-     credential_provider_builder = _init_credential_provider_builder(
-         credential_provider, source, storage_options, "scan_parquet"
-     )
-     del credential_provider
-
-     return _read_parquet_metadata(
-         source,
-         storage_options=(
-             list(storage_options.items()) if storage_options is not None else None
-         ),
-         credential_provider=credential_provider_builder,
-         retries=retries,
-     )
-
-
- @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
- @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
- def scan_parquet(
-     source: FileSource,
-     *,
-     n_rows: int | None = None,
-     row_index_name: str | None = None,
-     row_index_offset: int = 0,
-     parallel: ParallelStrategy = "auto",
-     use_statistics: bool = True,
-     hive_partitioning: bool | None = None,
-     glob: bool = True,
-     hidden_file_prefix: str | Sequence[str] | None = None,
-     schema: SchemaDict | None = None,
-     hive_schema: SchemaDict | None = None,
-     try_parse_hive_dates: bool = True,
-     rechunk: bool = False,
-     low_memory: bool = False,
-     cache: bool = True,
-     storage_options: dict[str, Any] | None = None,
-     credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
-     retries: int = 2,
-     include_file_paths: str | None = None,
-     missing_columns: Literal["insert", "raise"] = "raise",
-     allow_missing_columns: bool | None = None,
-     extra_columns: Literal["ignore", "raise"] = "raise",
-     cast_options: ScanCastOptions | None = None,
-     _column_mapping: ColumnMapping | None = None,
-     _default_values: DefaultFieldValues | None = None,
-     _deletion_files: DeletionFiles | None = None,
-     _table_statistics: DataFrame | None = None,
- ) -> LazyFrame:
-     """
-     Lazily read from a local or cloud-hosted parquet file (or files).
-
-     This function allows the query optimizer to push down predicates and projections to
-     the scan level, typically increasing performance and reducing memory overhead.
-
-     .. versionchanged:: 0.20.4
-         * The `row_count_name` parameter was renamed `row_index_name`.
-         * The `row_count_offset` parameter was renamed `row_index_offset`.
-
-     .. versionchanged:: 1.30.0
-         * The `allow_missing_columns` is deprecated in favor of `missing_columns`.
-
-     Parameters
-     ----------
-     source
-         Path(s) to a file or directory
-         When needing to authenticate for scanning cloud locations, see the
-         `storage_options` parameter.
-     n_rows
-         Stop reading from parquet file after reading `n_rows`.
-     row_index_name
-         If not None, this will insert a row index column with the given name into the
-         DataFrame
-     row_index_offset
-         Offset to start the row index column (only used if the name is set)
-     parallel : {'auto', 'columns', 'row_groups', 'prefiltered', 'none'}
-         This determines the direction and strategy of parallelism. 'auto' will
-         try to determine the optimal direction.
-
-         The `prefiltered` strategy first evaluates the pushed-down predicates in
-         parallel and determines a mask of which rows to read. Then, it
-         parallelizes over both the columns and the row groups while filtering
-         out rows that do not need to be read. This can provide significant
-         speedups for large files (i.e. many row-groups) with a predicate that
-         filters clustered rows or filters heavily. In other cases,
-         `prefiltered` may slow down the scan compared other strategies.
-
-         The `prefiltered` settings falls back to `auto` if no predicate is
-         given.
-
-         .. warning::
-             The `prefiltered` strategy is considered **unstable**. It may be
-             changed at any point without it being considered a breaking change.
-
-     use_statistics
-         Use statistics in the parquet to determine if pages
-         can be skipped from reading.
-     hive_partitioning
-         Infer statistics and schema from hive partitioned URL and use them
-         to prune reads.
-     glob
-         Expand path given via globbing rules.
-     hidden_file_prefix
-         Skip reading files whose names begin with the specified prefixes.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-     schema
-         Specify the datatypes of the columns. The datatypes must match the
-         datatypes in the file(s). If there are extra columns that are not in the
-         file(s), consider also passing `missing_columns='insert'`.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-     hive_schema
-         The column names and data types of the columns by which the data is partitioned.
-         If set to `None` (default), the schema of the Hive partitions is inferred.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-     try_parse_hive_dates
-         Whether to try parsing hive values as date/datetime types.
-     rechunk
-         In case of reading multiple files via a glob pattern rechunk the final DataFrame
-         into contiguous memory chunks.
-     low_memory
-         Reduce memory pressure at the expense of performance.
-     cache
-         Cache the result after reading.
-     storage_options
-         Options that indicate how to connect to a cloud provider.
-
-         The cloud providers currently supported are AWS, GCP, and Azure.
-         See supported keys here:
-
-         * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
-         * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
-         * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
-         * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
-           `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
-
-         If `storage_options` is not provided, Polars will try to infer the information
-         from environment variables.
-     credential_provider
-         Provide a function that can be called to provide cloud storage
-         credentials. The function is expected to return a dictionary of
-         credential keys along with an optional credential expiry time.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-     retries
-         Number of retries if accessing a cloud instance fails.
-     include_file_paths
-         Include the path of the source file(s) as a column with this name.
-     missing_columns
-         Configuration for behavior when columns defined in the schema
-         are missing from the data:
-
-         * `insert`: Inserts the missing columns using NULLs as the row values.
-         * `raise`: Raises an error.
-
-     allow_missing_columns
-         When reading a list of parquet files, if a column existing in the first
-         file cannot be found in subsequent files, the default behavior is to
-         raise an error. However, if `allow_missing_columns` is set to
-         `True`, a full-NULL column is returned instead of erroring for the files
-         that do not contain the column.
-
-         .. deprecated:: 1.30.0
-             Use the parameter `missing_columns` instead and pass one of
-             `('insert', 'raise')`.
-     extra_columns
-         Configuration for behavior when extra columns outside of the
-         defined schema are encountered in the data:
-
-         * `ignore`: Silently ignores.
-         * `raise`: Raises an error.
-
-     cast_options
-         Configuration for column type-casting during scans. Useful for datasets
-         containing files that have differing schemas.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-
-     See Also
-     --------
-     read_parquet
-     scan_pyarrow_dataset
-
-     Examples
-     --------
-     Scan a local Parquet file.
-
-     >>> pl.scan_parquet("path/to/file.parquet")  # doctest: +SKIP
-
-     Scan a file on AWS S3.
-
-     >>> source = "s3://bucket/*.parquet"
-     >>> pl.scan_parquet(source)  # doctest: +SKIP
-     >>> storage_options = {
-     ...     "aws_access_key_id": "<secret>",
-     ...     "aws_secret_access_key": "<secret>",
-     ...     "aws_region": "us-east-1",
-     ... }
-     >>> pl.scan_parquet(source, storage_options=storage_options)  # doctest: +SKIP
-     """
-     if schema is not None:
-         msg = "the `schema` parameter of `scan_parquet` is considered unstable."
-         issue_unstable_warning(msg)
-
-     if hive_schema is not None:
-         msg = "the `hive_schema` parameter of `scan_parquet` is considered unstable."
-         issue_unstable_warning(msg)
-
-     if cast_options is not None:
-         msg = "The `cast_options` parameter of `scan_parquet` is considered unstable."
-         issue_unstable_warning(msg)
-
-     if hidden_file_prefix is not None:
-         msg = "The `hidden_file_prefix` parameter of `scan_parquet` is considered unstable."
-         issue_unstable_warning(msg)
-
-     if allow_missing_columns is not None:
-         issue_deprecation_warning(
-             "the parameter `allow_missing_columns` for `scan_parquet` is deprecated. "
-             "Use the parameter `missing_columns` instead and pass one of "
-             "`('insert', 'raise')`.",
-             version="1.30.0",
-         )
-
-         missing_columns = "insert" if allow_missing_columns else "raise"
-
-     if isinstance(source, (str, Path)):
-         source = normalize_filepath(source, check_not_directory=False)
-     elif is_path_or_str_sequence(source):
-         source = [
-             normalize_filepath(source, check_not_directory=False) for source in source
-         ]
-
-     credential_provider_builder = _init_credential_provider_builder(
-         credential_provider, source, storage_options, "scan_parquet"
-     )
-
-     del credential_provider
-
-     sources = (
-         [source]
-         if not isinstance(source, Sequence) or isinstance(source, (str, bytes))
-         else source
-     )
-
-     pylf = PyLazyFrame.new_from_parquet(
-         sources=sources,
-         schema=schema,
-         parallel=parallel,
-         low_memory=low_memory,
-         use_statistics=use_statistics,
-         scan_options=ScanOptions(
-             row_index=(
-                 (row_index_name, row_index_offset)
-                 if row_index_name is not None
-                 else None
-             ),
-             pre_slice=(0, n_rows) if n_rows is not None else None,
-             cast_options=cast_options,
-             extra_columns=extra_columns,
-             missing_columns=missing_columns,
-             include_file_paths=include_file_paths,
-             glob=glob,
-             hidden_file_prefix=(
-                 [hidden_file_prefix]
-                 if isinstance(hidden_file_prefix, str)
-                 else hidden_file_prefix
-             ),
-             hive_partitioning=hive_partitioning,
-             hive_schema=hive_schema,
-             try_parse_hive_dates=try_parse_hive_dates,
-             rechunk=rechunk,
-             cache=cache,
-             storage_options=(
-                 list(storage_options.items()) if storage_options is not None else None
-             ),
-             credential_provider=credential_provider_builder,
-             retries=retries,
-             column_mapping=_column_mapping,
-             default_values=_default_values,
-             deletion_files=_deletion_files,
-             table_statistics=_table_statistics,
-         ),
-     )
-
-     return wrap_ldf(pylf)
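
For reference, the removed module above implemented the `read_parquet` and `scan_parquet` entry points. A minimal usage sketch of that API, assuming the usual top-level `polars` re-exports (file paths here are placeholders):

    import polars as pl

    # Eager read: materializes the whole file into a DataFrame.
    df = pl.read_parquet("data/example.parquet")

    # Lazy scan: preferred for large data, since predicates and projections
    # are pushed down into the parquet reader before anything is materialized.
    lf = pl.scan_parquet("data/*.parquet")
    result = lf.filter(pl.col("value") > 0).select("id", "value").collect()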