polars-runtime-compat 1.34.0b2__cp39-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of polars-runtime-compat might be problematic.
Files changed (203)
  1. _polars_runtime_compat/.gitkeep +0 -0
  2. _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
  3. polars/__init__.py +528 -0
  4. polars/_cpu_check.py +265 -0
  5. polars/_dependencies.py +355 -0
  6. polars/_plr.py +99 -0
  7. polars/_plr.pyi +2496 -0
  8. polars/_reexport.py +23 -0
  9. polars/_typing.py +478 -0
  10. polars/_utils/__init__.py +37 -0
  11. polars/_utils/async_.py +102 -0
  12. polars/_utils/cache.py +176 -0
  13. polars/_utils/cloud.py +40 -0
  14. polars/_utils/constants.py +29 -0
  15. polars/_utils/construction/__init__.py +46 -0
  16. polars/_utils/construction/dataframe.py +1397 -0
  17. polars/_utils/construction/other.py +72 -0
  18. polars/_utils/construction/series.py +560 -0
  19. polars/_utils/construction/utils.py +118 -0
  20. polars/_utils/convert.py +224 -0
  21. polars/_utils/deprecation.py +406 -0
  22. polars/_utils/getitem.py +457 -0
  23. polars/_utils/logging.py +11 -0
  24. polars/_utils/nest_asyncio.py +264 -0
  25. polars/_utils/parquet.py +15 -0
  26. polars/_utils/parse/__init__.py +12 -0
  27. polars/_utils/parse/expr.py +242 -0
  28. polars/_utils/polars_version.py +19 -0
  29. polars/_utils/pycapsule.py +53 -0
  30. polars/_utils/scan.py +27 -0
  31. polars/_utils/serde.py +63 -0
  32. polars/_utils/slice.py +215 -0
  33. polars/_utils/udfs.py +1251 -0
  34. polars/_utils/unstable.py +63 -0
  35. polars/_utils/various.py +782 -0
  36. polars/_utils/wrap.py +25 -0
  37. polars/api.py +370 -0
  38. polars/catalog/__init__.py +0 -0
  39. polars/catalog/unity/__init__.py +19 -0
  40. polars/catalog/unity/client.py +733 -0
  41. polars/catalog/unity/models.py +152 -0
  42. polars/config.py +1571 -0
  43. polars/convert/__init__.py +25 -0
  44. polars/convert/general.py +1046 -0
  45. polars/convert/normalize.py +261 -0
  46. polars/dataframe/__init__.py +5 -0
  47. polars/dataframe/_html.py +186 -0
  48. polars/dataframe/frame.py +12582 -0
  49. polars/dataframe/group_by.py +1067 -0
  50. polars/dataframe/plotting.py +257 -0
  51. polars/datatype_expr/__init__.py +5 -0
  52. polars/datatype_expr/array.py +56 -0
  53. polars/datatype_expr/datatype_expr.py +304 -0
  54. polars/datatype_expr/list.py +18 -0
  55. polars/datatype_expr/struct.py +69 -0
  56. polars/datatypes/__init__.py +122 -0
  57. polars/datatypes/_parse.py +195 -0
  58. polars/datatypes/_utils.py +48 -0
  59. polars/datatypes/classes.py +1213 -0
  60. polars/datatypes/constants.py +11 -0
  61. polars/datatypes/constructor.py +172 -0
  62. polars/datatypes/convert.py +366 -0
  63. polars/datatypes/group.py +130 -0
  64. polars/exceptions.py +230 -0
  65. polars/expr/__init__.py +7 -0
  66. polars/expr/array.py +964 -0
  67. polars/expr/binary.py +346 -0
  68. polars/expr/categorical.py +306 -0
  69. polars/expr/datetime.py +2620 -0
  70. polars/expr/expr.py +11272 -0
  71. polars/expr/list.py +1408 -0
  72. polars/expr/meta.py +444 -0
  73. polars/expr/name.py +321 -0
  74. polars/expr/string.py +3045 -0
  75. polars/expr/struct.py +357 -0
  76. polars/expr/whenthen.py +185 -0
  77. polars/functions/__init__.py +193 -0
  78. polars/functions/aggregation/__init__.py +33 -0
  79. polars/functions/aggregation/horizontal.py +298 -0
  80. polars/functions/aggregation/vertical.py +341 -0
  81. polars/functions/as_datatype.py +848 -0
  82. polars/functions/business.py +138 -0
  83. polars/functions/col.py +384 -0
  84. polars/functions/datatype.py +121 -0
  85. polars/functions/eager.py +524 -0
  86. polars/functions/escape_regex.py +29 -0
  87. polars/functions/lazy.py +2751 -0
  88. polars/functions/len.py +68 -0
  89. polars/functions/lit.py +210 -0
  90. polars/functions/random.py +22 -0
  91. polars/functions/range/__init__.py +19 -0
  92. polars/functions/range/_utils.py +15 -0
  93. polars/functions/range/date_range.py +303 -0
  94. polars/functions/range/datetime_range.py +370 -0
  95. polars/functions/range/int_range.py +348 -0
  96. polars/functions/range/linear_space.py +311 -0
  97. polars/functions/range/time_range.py +287 -0
  98. polars/functions/repeat.py +301 -0
  99. polars/functions/whenthen.py +353 -0
  100. polars/interchange/__init__.py +10 -0
  101. polars/interchange/buffer.py +77 -0
  102. polars/interchange/column.py +190 -0
  103. polars/interchange/dataframe.py +230 -0
  104. polars/interchange/from_dataframe.py +328 -0
  105. polars/interchange/protocol.py +303 -0
  106. polars/interchange/utils.py +170 -0
  107. polars/io/__init__.py +64 -0
  108. polars/io/_utils.py +317 -0
  109. polars/io/avro.py +49 -0
  110. polars/io/clipboard.py +36 -0
  111. polars/io/cloud/__init__.py +17 -0
  112. polars/io/cloud/_utils.py +80 -0
  113. polars/io/cloud/credential_provider/__init__.py +17 -0
  114. polars/io/cloud/credential_provider/_builder.py +520 -0
  115. polars/io/cloud/credential_provider/_providers.py +618 -0
  116. polars/io/csv/__init__.py +9 -0
  117. polars/io/csv/_utils.py +38 -0
  118. polars/io/csv/batched_reader.py +142 -0
  119. polars/io/csv/functions.py +1495 -0
  120. polars/io/database/__init__.py +6 -0
  121. polars/io/database/_arrow_registry.py +70 -0
  122. polars/io/database/_cursor_proxies.py +147 -0
  123. polars/io/database/_executor.py +578 -0
  124. polars/io/database/_inference.py +314 -0
  125. polars/io/database/_utils.py +144 -0
  126. polars/io/database/functions.py +516 -0
  127. polars/io/delta.py +499 -0
  128. polars/io/iceberg/__init__.py +3 -0
  129. polars/io/iceberg/_utils.py +697 -0
  130. polars/io/iceberg/dataset.py +556 -0
  131. polars/io/iceberg/functions.py +151 -0
  132. polars/io/ipc/__init__.py +8 -0
  133. polars/io/ipc/functions.py +514 -0
  134. polars/io/json/__init__.py +3 -0
  135. polars/io/json/read.py +101 -0
  136. polars/io/ndjson.py +332 -0
  137. polars/io/parquet/__init__.py +17 -0
  138. polars/io/parquet/field_overwrites.py +140 -0
  139. polars/io/parquet/functions.py +722 -0
  140. polars/io/partition.py +491 -0
  141. polars/io/plugins.py +187 -0
  142. polars/io/pyarrow_dataset/__init__.py +5 -0
  143. polars/io/pyarrow_dataset/anonymous_scan.py +109 -0
  144. polars/io/pyarrow_dataset/functions.py +79 -0
  145. polars/io/scan_options/__init__.py +5 -0
  146. polars/io/scan_options/_options.py +59 -0
  147. polars/io/scan_options/cast_options.py +126 -0
  148. polars/io/spreadsheet/__init__.py +6 -0
  149. polars/io/spreadsheet/_utils.py +52 -0
  150. polars/io/spreadsheet/_write_utils.py +647 -0
  151. polars/io/spreadsheet/functions.py +1323 -0
  152. polars/lazyframe/__init__.py +9 -0
  153. polars/lazyframe/engine_config.py +61 -0
  154. polars/lazyframe/frame.py +8564 -0
  155. polars/lazyframe/group_by.py +669 -0
  156. polars/lazyframe/in_process.py +42 -0
  157. polars/lazyframe/opt_flags.py +333 -0
  158. polars/meta/__init__.py +14 -0
  159. polars/meta/build.py +33 -0
  160. polars/meta/index_type.py +27 -0
  161. polars/meta/thread_pool.py +50 -0
  162. polars/meta/versions.py +120 -0
  163. polars/ml/__init__.py +0 -0
  164. polars/ml/torch.py +213 -0
  165. polars/ml/utilities.py +30 -0
  166. polars/plugins.py +155 -0
  167. polars/py.typed +0 -0
  168. polars/pyproject.toml +96 -0
  169. polars/schema.py +265 -0
  170. polars/selectors.py +3117 -0
  171. polars/series/__init__.py +5 -0
  172. polars/series/array.py +776 -0
  173. polars/series/binary.py +254 -0
  174. polars/series/categorical.py +246 -0
  175. polars/series/datetime.py +2275 -0
  176. polars/series/list.py +1087 -0
  177. polars/series/plotting.py +191 -0
  178. polars/series/series.py +9197 -0
  179. polars/series/string.py +2367 -0
  180. polars/series/struct.py +154 -0
  181. polars/series/utils.py +191 -0
  182. polars/sql/__init__.py +7 -0
  183. polars/sql/context.py +677 -0
  184. polars/sql/functions.py +139 -0
  185. polars/string_cache.py +185 -0
  186. polars/testing/__init__.py +13 -0
  187. polars/testing/asserts/__init__.py +9 -0
  188. polars/testing/asserts/frame.py +231 -0
  189. polars/testing/asserts/series.py +219 -0
  190. polars/testing/asserts/utils.py +12 -0
  191. polars/testing/parametric/__init__.py +33 -0
  192. polars/testing/parametric/profiles.py +107 -0
  193. polars/testing/parametric/strategies/__init__.py +22 -0
  194. polars/testing/parametric/strategies/_utils.py +14 -0
  195. polars/testing/parametric/strategies/core.py +615 -0
  196. polars/testing/parametric/strategies/data.py +452 -0
  197. polars/testing/parametric/strategies/dtype.py +436 -0
  198. polars/testing/parametric/strategies/legacy.py +169 -0
  199. polars/type_aliases.py +24 -0
  200. polars_runtime_compat-1.34.0b2.dist-info/METADATA +190 -0
  201. polars_runtime_compat-1.34.0b2.dist-info/RECORD +203 -0
  202. polars_runtime_compat-1.34.0b2.dist-info/WHEEL +4 -0
  203. polars_runtime_compat-1.34.0b2.dist-info/licenses/LICENSE +20 -0
polars/io/ndjson.py ADDED
@@ -0,0 +1,332 @@
+ from __future__ import annotations
+
+ import contextlib
+ from pathlib import Path
+ from typing import IO, TYPE_CHECKING, Any, Literal
+
+ from polars._utils.deprecation import deprecate_renamed_parameter
+ from polars._utils.various import is_path_or_str_sequence, normalize_filepath
+ from polars._utils.wrap import wrap_ldf
+ from polars.datatypes import N_INFER_DEFAULT
+ from polars.io._utils import parse_row_index_args
+ from polars.io.cloud.credential_provider._builder import (
+     _init_credential_provider_builder,
+ )
+
+ with contextlib.suppress(ImportError):  # Module not available when building docs
+     from polars._plr import PyLazyFrame
+
+ if TYPE_CHECKING:
+     from polars import DataFrame, LazyFrame
+     from polars._typing import SchemaDefinition
+     from polars.io.cloud import CredentialProviderFunction
+
+
+ def read_ndjson(
+     source: str
+     | Path
+     | IO[str]
+     | IO[bytes]
+     | bytes
+     | list[str]
+     | list[Path]
+     | list[IO[str]]
+     | list[IO[bytes]],
+     *,
+     schema: SchemaDefinition | None = None,
+     schema_overrides: SchemaDefinition | None = None,
+     infer_schema_length: int | None = N_INFER_DEFAULT,
+     batch_size: int | None = 1024,
+     n_rows: int | None = None,
+     low_memory: bool = False,
+     rechunk: bool = False,
+     row_index_name: str | None = None,
+     row_index_offset: int = 0,
+     ignore_errors: bool = False,
+     storage_options: dict[str, Any] | None = None,
+     credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
+     retries: int = 2,
+     file_cache_ttl: int | None = None,
+     include_file_paths: str | None = None,
+ ) -> DataFrame:
+     r"""
+     Read into a DataFrame from a newline delimited JSON file.
+
+     Parameters
+     ----------
+     source
+         Path to a file or a file-like object (by "file-like object" we refer to objects
+         that have a `read()` method, such as a file handler like the builtin `open`
+         function, or a `BytesIO` instance). For file-like objects, the stream position
+         may not be updated accordingly after reading.
+     schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
+         The DataFrame schema may be declared in several ways:
+
+         * As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
+         * As a list of column names; in this case types are automatically inferred.
+         * As a list of (name,type) pairs; this is equivalent to the dictionary form.
+
+         If you supply a list of column names that does not match the names in the
+         underlying data, the names given here will overwrite them. The number
+         of names given in the schema should match the underlying data dimensions.
+     schema_overrides : dict, default None
+         Support type specification or override of one or more columns; note that
+         any dtypes inferred from the schema param will be overridden.
+     infer_schema_length
+         The maximum number of rows to scan for schema inference.
+         If set to `None`, the full data may be scanned *(this is slow)*.
+     batch_size
+         Number of rows to read in each batch.
+     n_rows
+         Stop reading from the JSON file after reading `n_rows`.
+     low_memory
+         Reduce memory pressure at the expense of performance.
+     rechunk
+         Reallocate to contiguous memory when all chunks/files are parsed.
+     row_index_name
+         If not None, this will insert a row index column with the given name into
+         the DataFrame.
+     row_index_offset
+         Offset to start the row index column (only used if the name is set).
+     ignore_errors
+         Return `Null` if parsing fails because of schema mismatches.
+     storage_options
+         Options that indicate how to connect to a cloud provider.
+
+         The cloud providers currently supported are AWS, GCP, and Azure.
+         See supported keys here:
+
+         * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
+         * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
+         * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
+         * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
+           `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
+
+         If `storage_options` is not provided, Polars will try to infer the information
+         from environment variables.
+     credential_provider
+         Provide a function that can be called to provide cloud storage
+         credentials. The function is expected to return a dictionary of
+         credential keys along with an optional credential expiry time.
+
+         .. warning::
+             This functionality is considered **unstable**. It may be changed
+             at any point without it being considered a breaking change.
+     retries
+         Number of retries if accessing a cloud instance fails.
+     file_cache_ttl
+         Amount of time to keep downloaded cloud files since their last access time,
+         in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
+         (which defaults to 1 hour) if not given.
+     include_file_paths
+         Include the path of the source file(s) as a column with this name.
+
+     See Also
+     --------
+     scan_ndjson : Lazily read from an NDJSON file or multiple files via glob patterns.
+
+     Warnings
+     --------
+     Calling `read_ndjson().lazy()` is an antipattern as this forces Polars to
+     materialize a full ndjson file and therefore cannot push any optimizations into
+     the reader. Therefore always prefer `scan_ndjson` if you want to work with
+     `LazyFrame` s.
+
+     Examples
+     --------
+     >>> from io import StringIO
+     >>> json_str = '{"foo":1,"bar":6}\n{"foo":2,"bar":7}\n{"foo":3,"bar":8}\n'
+     >>> pl.read_ndjson(StringIO(json_str))
+     shape: (3, 2)
+     ┌─────┬─────┐
+     │ foo ┆ bar │
+     │ --- ┆ --- │
+     │ i64 ┆ i64 │
+     ╞═════╪═════╡
+     │ 1   ┆ 6   │
+     │ 2   ┆ 7   │
+     │ 3   ┆ 8   │
+     └─────┴─────┘
+     """
+     credential_provider_builder = _init_credential_provider_builder(
+         credential_provider, source, storage_options, "read_ndjson"
+     )
+
+     del credential_provider
+
+     return scan_ndjson(
+         source,
+         schema=schema,
+         schema_overrides=schema_overrides,
+         infer_schema_length=infer_schema_length,
+         batch_size=batch_size,
+         n_rows=n_rows,
+         low_memory=low_memory,
+         rechunk=rechunk,
+         row_index_name=row_index_name,
+         row_index_offset=row_index_offset,
+         ignore_errors=ignore_errors,
+         include_file_paths=include_file_paths,
+         retries=retries,
+         storage_options=storage_options,
+         credential_provider=credential_provider_builder,  # type: ignore[arg-type]
+         file_cache_ttl=file_cache_ttl,
+     ).collect()
+
+
+ @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
+ @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
+ def scan_ndjson(
+     source: (
+         str
+         | Path
+         | IO[str]
+         | IO[bytes]
+         | bytes
+         | list[str]
+         | list[Path]
+         | list[IO[str]]
+         | list[IO[bytes]]
+     ),
+     *,
+     schema: SchemaDefinition | None = None,
+     schema_overrides: SchemaDefinition | None = None,
+     infer_schema_length: int | None = N_INFER_DEFAULT,
+     batch_size: int | None = 1024,
+     n_rows: int | None = None,
+     low_memory: bool = False,
+     rechunk: bool = False,
+     row_index_name: str | None = None,
+     row_index_offset: int = 0,
+     ignore_errors: bool = False,
+     storage_options: dict[str, Any] | None = None,
+     credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
+     retries: int = 2,
+     file_cache_ttl: int | None = None,
+     include_file_paths: str | None = None,
+ ) -> LazyFrame:
+     """
+     Lazily read from a newline delimited JSON file or multiple files via glob patterns.
+
+     This allows the query optimizer to push down predicates and projections to the scan
+     level, thereby potentially reducing memory overhead.
+
+     .. versionchanged:: 0.20.4
+         * The `row_count_name` parameter was renamed `row_index_name`.
+         * The `row_count_offset` parameter was renamed `row_index_offset`.
+
+     Parameters
+     ----------
+     source
+         Path to a file.
+     schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
+         The DataFrame schema may be declared in several ways:
+
+         * As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
+         * As a list of column names; in this case types are automatically inferred.
+         * As a list of (name,type) pairs; this is equivalent to the dictionary form.
+
+         If you supply a list of column names that does not match the names in the
+         underlying data, the names given here will overwrite them. The number
+         of names given in the schema should match the underlying data dimensions.
+     schema_overrides : dict, default None
+         Support type specification or override of one or more columns; note that
+         any dtypes inferred from the schema param will be overridden.
+     infer_schema_length
+         The maximum number of rows to scan for schema inference.
+         If set to `None`, the full data may be scanned *(this is slow)*.
+     batch_size
+         Number of rows to read in each batch.
+     n_rows
+         Stop reading from the JSON file after reading `n_rows`.
+     low_memory
+         Reduce memory pressure at the expense of performance.
+     rechunk
+         Reallocate to contiguous memory when all chunks/files are parsed.
+     row_index_name
+         If not None, this will insert a row index column with the given name into
+         the DataFrame.
+     row_index_offset
+         Offset to start the row index column (only used if the name is set).
+     ignore_errors
+         Return `Null` if parsing fails because of schema mismatches.
+     storage_options
+         Options that indicate how to connect to a cloud provider.
+
+         The cloud providers currently supported are AWS, GCP, and Azure.
+         See supported keys here:
+
+         * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
+         * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
+         * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
+         * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
+           `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
+
+         If `storage_options` is not provided, Polars will try to infer the information
+         from environment variables.
+     credential_provider
+         Provide a function that can be called to provide cloud storage
+         credentials. The function is expected to return a dictionary of
+         credential keys along with an optional credential expiry time.
+
+         .. warning::
+             This functionality is considered **unstable**. It may be changed
+             at any point without it being considered a breaking change.
+     retries
+         Number of retries if accessing a cloud instance fails.
+     file_cache_ttl
+         Amount of time to keep downloaded cloud files since their last access time,
+         in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
+         (which defaults to 1 hour) if not given.
+     include_file_paths
+         Include the path of the source file(s) as a column with this name.
+     """
+     sources: list[str] | list[Path] | list[IO[str]] | list[IO[bytes]] = []
+     if isinstance(source, (str, Path)):
+         source = normalize_filepath(source, check_not_directory=False)
+     elif isinstance(source, list):
+         if is_path_or_str_sequence(source):
+             sources = [
+                 normalize_filepath(source, check_not_directory=False)
+                 for source in source
+             ]
+         else:
+             sources = source
+
+         source = None  # type: ignore[assignment]
+
+     if infer_schema_length == 0:
+         msg = "'infer_schema_length' should be positive"
+         raise ValueError(msg)
+
+     credential_provider_builder = _init_credential_provider_builder(
+         credential_provider, source, storage_options, "scan_ndjson"
+     )
+
+     del credential_provider
+
+     if storage_options:
+         storage_options = list(storage_options.items())  # type: ignore[assignment]
+     else:
+         # Handle empty dict input
+         storage_options = None
+
+     pylf = PyLazyFrame.new_from_ndjson(
+         source,
+         sources,
+         infer_schema_length=infer_schema_length,
+         schema=schema,
+         schema_overrides=schema_overrides,
+         batch_size=batch_size,
+         n_rows=n_rows,
+         low_memory=low_memory,
+         rechunk=rechunk,
+         row_index=parse_row_index_args(row_index_name, row_index_offset),
+         ignore_errors=ignore_errors,
+         include_file_paths=include_file_paths,
+         retries=retries,
+         cloud_options=storage_options,
+         credential_provider=credential_provider_builder,
+         file_cache_ttl=file_cache_ttl,
+     )
+     return wrap_ldf(pylf)
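
For orientation, a minimal usage sketch of the `scan_ndjson` API added above (not part of the diff; the inline NDJSON data, column names, and dtypes are illustrative assumptions, though the parameters used are exactly those in the signature shown):

import io

import polars as pl

buf = io.StringIO('{"foo":1,"bar":6}\n{"foo":2,"bar":7}\n{"foo":3,"bar":8}\n')
lf = pl.scan_ndjson(
    buf,
    schema={"foo": pl.Int64, "bar": pl.Int64},  # explicit schema skips inference
    row_index_name="idx",  # prepend a row index column named "idx"
)
# Because this is a lazy scan, only "idx" and "foo" need to be
# materialized when the query is collected.
print(lf.select("idx", "foo").collect())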
polars/io/parquet/__init__.py ADDED
@@ -0,0 +1,17 @@
+ from polars.io.parquet.field_overwrites import (
+     ParquetFieldOverwrites,
+ )
+ from polars.io.parquet.functions import (
+     read_parquet,
+     read_parquet_metadata,
+     read_parquet_schema,
+     scan_parquet,
+ )
+
+ __all__ = [
+     "ParquetFieldOverwrites",
+     "read_parquet",
+     "read_parquet_metadata",
+     "read_parquet_schema",
+     "scan_parquet",
+ ]
polars/io/parquet/field_overwrites.py ADDED
@@ -0,0 +1,140 @@
+ from __future__ import annotations
+
+ from collections.abc import Mapping, Sequence
+ from typing import Any
+
+
+ def _parquet_field_overwrites_dict_to_dict_list(
+     pqo: dict[str, ParquetFieldOverwrites],
+ ) -> list[dict[str, Any]]:
+     children = []
+     for name, child in pqo.items():
+         if child.name is not None:
+             msg = "ParquetFieldOverwrites has both a name in the dictionary and in the overwrites"
+             raise ValueError(msg)
+         child.name = name
+         children.append(_parquet_field_overwrites_to_dict(child))
+     return children
+
+
+ def _parquet_field_overwrites_to_dict(pqo: ParquetFieldOverwrites) -> dict[str, Any]:
+     d: dict[str, Any] = {}
+
+     # Name
+     if pqo.name is not None:
+         d["name"] = pqo.name
+
+     # Children
+     if pqo.children is not None:
+         if isinstance(pqo.children, ParquetFieldOverwrites):
+             d["children"] = _parquet_field_overwrites_to_dict(pqo.children)
+         elif isinstance(pqo.children, dict):
+             d["children"] = _parquet_field_overwrites_dict_to_dict_list(pqo.children)
+         elif isinstance(pqo.children, list):
+             d["children"] = [_parquet_field_overwrites_to_dict(c) for c in pqo.children]
+         else:
+             msg = "invalid ParquetFieldOverwrites children type"
+             raise TypeError(msg)
+
+     if pqo.field_id is not None:
+         d["field_id"] = pqo.field_id
+
+     # Metadata
+     if pqo.metadata is not None:
+         d["metadata"] = list(pqo.metadata.items())
+
+     if pqo.required is not None:
+         d["required"] = pqo.required
+
+     return d
+
+
+ class ParquetFieldOverwrites:
+     """
+     Write-option overwrites for individual Parquet fields.
+
+     .. warning::
+         This functionality is considered **unstable**. It may be changed
+         at any point without it being considered a breaking change.
+
+     Examples
+     --------
+     >>> lf = pl.LazyFrame(
+     ...     {
+     ...         "a": [None, 2, 3, 4],
+     ...         "b": [[1, 2, 3], [42], [13], [37]],
+     ...         "c": [
+     ...             {"x": "a", "y": 42},
+     ...             {"x": "b", "y": 13},
+     ...             {"x": "X", "y": 37},
+     ...             {"x": "Y", "y": 15},
+     ...         ],
+     ...     }
+     ... )  # doctest: +SKIP
+     >>> lf.sink_parquet(
+     ...     "./out/parquet",
+     ...     field_overwrites={
+     ...         "a": ParquetFieldOverwrites(metadata={"flat_from_polars": "yes"}),
+     ...         "b": ParquetFieldOverwrites(
+     ...             children=ParquetFieldOverwrites(metadata={"listitem": "yes"}),
+     ...             metadata={"list": "true"},
+     ...         ),
+     ...         "c": ParquetFieldOverwrites(
+     ...             children=[
+     ...                 ParquetFieldOverwrites(name="x", metadata={"md": "yes"}),
+     ...                 ParquetFieldOverwrites(name="y", metadata={"md2": "Yes!"}),
+     ...             ],
+     ...             metadata={"struct": "true"},
+     ...         ),
+     ...     },
+     ... )  # doctest: +SKIP
+     """
+
+     name: None | str  #: Name of the column or field
+     children: (
+         None
+         | ParquetFieldOverwrites
+         | list[ParquetFieldOverwrites]
+         | dict[str, ParquetFieldOverwrites]
+     )  #: Children of the column or field.
+     #
+     # For flat types (e.g. `Int32`), this should be `None`. For lists, this can be an
+     # unnamed `ParquetFieldOverwrites`. For structs, this can be a dict or list of
+     # named overwrites.
+
+     field_id: int | None = None  #: The field ID used in the Parquet schema
+     metadata: (
+         dict[str, None | str] | None
+     )  #: Arrow metadata added to the field before writing
+     required: bool | None = None  #: Whether the field is disallowed from having missing values
+
+     def __init__(
+         self,
+         *,
+         name: str | None = None,
+         children: (
+             None
+             | ParquetFieldOverwrites
+             | Sequence[ParquetFieldOverwrites]
+             | Mapping[str, ParquetFieldOverwrites]
+         ) = None,
+         field_id: int | None = None,
+         metadata: Mapping[str, None | str] | None = None,
+         required: bool | None = None,
+     ) -> None:
+         self.name = name
+
+         if isinstance(children, Mapping):
+             self.children = dict(children)
+         elif isinstance(children, Sequence):
+             self.children = list(children)
+         else:
+             self.children = children
+
+         self.field_id = field_id
+         if isinstance(metadata, Mapping):
+             self.metadata = dict(metadata)
+         else:
+             self.metadata = metadata
+         self.required = required
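
As a hedged sketch of what the conversion helpers above produce (not part of the diff; it calls the private `_parquet_field_overwrites_to_dict` purely for illustration, and the overwrite tree itself is invented):

from polars.io.parquet.field_overwrites import (
    ParquetFieldOverwrites,
    _parquet_field_overwrites_to_dict,
)

# A struct column "c" with one named child "x" carrying field metadata.
ov = ParquetFieldOverwrites(
    name="c",
    children={"x": ParquetFieldOverwrites(metadata={"md": "yes"})},
    metadata={"struct": "true"},
)

# Keys mirror the branches handled above: name, children, metadata;
# field_id and required are omitted because they were left as None.
print(_parquet_field_overwrites_to_dict(ov))
# {'name': 'c', 'children': [{'name': 'x', 'metadata': [('md', 'yes')]}],
#  'metadata': [('struct', 'true')]}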