polars-runtime-compat 1.34.0b2__cp39-abi3-win_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of polars-runtime-compat might be problematic.

Files changed (203)
  1. _polars_runtime_compat/.gitkeep +0 -0
  2. _polars_runtime_compat/_polars_runtime_compat.pyd +0 -0
  3. polars/__init__.py +528 -0
  4. polars/_cpu_check.py +265 -0
  5. polars/_dependencies.py +355 -0
  6. polars/_plr.py +99 -0
  7. polars/_plr.pyi +2496 -0
  8. polars/_reexport.py +23 -0
  9. polars/_typing.py +478 -0
  10. polars/_utils/__init__.py +37 -0
  11. polars/_utils/async_.py +102 -0
  12. polars/_utils/cache.py +176 -0
  13. polars/_utils/cloud.py +40 -0
  14. polars/_utils/constants.py +29 -0
  15. polars/_utils/construction/__init__.py +46 -0
  16. polars/_utils/construction/dataframe.py +1397 -0
  17. polars/_utils/construction/other.py +72 -0
  18. polars/_utils/construction/series.py +560 -0
  19. polars/_utils/construction/utils.py +118 -0
  20. polars/_utils/convert.py +224 -0
  21. polars/_utils/deprecation.py +406 -0
  22. polars/_utils/getitem.py +457 -0
  23. polars/_utils/logging.py +11 -0
  24. polars/_utils/nest_asyncio.py +264 -0
  25. polars/_utils/parquet.py +15 -0
  26. polars/_utils/parse/__init__.py +12 -0
  27. polars/_utils/parse/expr.py +242 -0
  28. polars/_utils/polars_version.py +19 -0
  29. polars/_utils/pycapsule.py +53 -0
  30. polars/_utils/scan.py +27 -0
  31. polars/_utils/serde.py +63 -0
  32. polars/_utils/slice.py +215 -0
  33. polars/_utils/udfs.py +1251 -0
  34. polars/_utils/unstable.py +63 -0
  35. polars/_utils/various.py +782 -0
  36. polars/_utils/wrap.py +25 -0
  37. polars/api.py +370 -0
  38. polars/catalog/__init__.py +0 -0
  39. polars/catalog/unity/__init__.py +19 -0
  40. polars/catalog/unity/client.py +733 -0
  41. polars/catalog/unity/models.py +152 -0
  42. polars/config.py +1571 -0
  43. polars/convert/__init__.py +25 -0
  44. polars/convert/general.py +1046 -0
  45. polars/convert/normalize.py +261 -0
  46. polars/dataframe/__init__.py +5 -0
  47. polars/dataframe/_html.py +186 -0
  48. polars/dataframe/frame.py +12582 -0
  49. polars/dataframe/group_by.py +1067 -0
  50. polars/dataframe/plotting.py +257 -0
  51. polars/datatype_expr/__init__.py +5 -0
  52. polars/datatype_expr/array.py +56 -0
  53. polars/datatype_expr/datatype_expr.py +304 -0
  54. polars/datatype_expr/list.py +18 -0
  55. polars/datatype_expr/struct.py +69 -0
  56. polars/datatypes/__init__.py +122 -0
  57. polars/datatypes/_parse.py +195 -0
  58. polars/datatypes/_utils.py +48 -0
  59. polars/datatypes/classes.py +1213 -0
  60. polars/datatypes/constants.py +11 -0
  61. polars/datatypes/constructor.py +172 -0
  62. polars/datatypes/convert.py +366 -0
  63. polars/datatypes/group.py +130 -0
  64. polars/exceptions.py +230 -0
  65. polars/expr/__init__.py +7 -0
  66. polars/expr/array.py +964 -0
  67. polars/expr/binary.py +346 -0
  68. polars/expr/categorical.py +306 -0
  69. polars/expr/datetime.py +2620 -0
  70. polars/expr/expr.py +11272 -0
  71. polars/expr/list.py +1408 -0
  72. polars/expr/meta.py +444 -0
  73. polars/expr/name.py +321 -0
  74. polars/expr/string.py +3045 -0
  75. polars/expr/struct.py +357 -0
  76. polars/expr/whenthen.py +185 -0
  77. polars/functions/__init__.py +193 -0
  78. polars/functions/aggregation/__init__.py +33 -0
  79. polars/functions/aggregation/horizontal.py +298 -0
  80. polars/functions/aggregation/vertical.py +341 -0
  81. polars/functions/as_datatype.py +848 -0
  82. polars/functions/business.py +138 -0
  83. polars/functions/col.py +384 -0
  84. polars/functions/datatype.py +121 -0
  85. polars/functions/eager.py +524 -0
  86. polars/functions/escape_regex.py +29 -0
  87. polars/functions/lazy.py +2751 -0
  88. polars/functions/len.py +68 -0
  89. polars/functions/lit.py +210 -0
  90. polars/functions/random.py +22 -0
  91. polars/functions/range/__init__.py +19 -0
  92. polars/functions/range/_utils.py +15 -0
  93. polars/functions/range/date_range.py +303 -0
  94. polars/functions/range/datetime_range.py +370 -0
  95. polars/functions/range/int_range.py +348 -0
  96. polars/functions/range/linear_space.py +311 -0
  97. polars/functions/range/time_range.py +287 -0
  98. polars/functions/repeat.py +301 -0
  99. polars/functions/whenthen.py +353 -0
  100. polars/interchange/__init__.py +10 -0
  101. polars/interchange/buffer.py +77 -0
  102. polars/interchange/column.py +190 -0
  103. polars/interchange/dataframe.py +230 -0
  104. polars/interchange/from_dataframe.py +328 -0
  105. polars/interchange/protocol.py +303 -0
  106. polars/interchange/utils.py +170 -0
  107. polars/io/__init__.py +64 -0
  108. polars/io/_utils.py +317 -0
  109. polars/io/avro.py +49 -0
  110. polars/io/clipboard.py +36 -0
  111. polars/io/cloud/__init__.py +17 -0
  112. polars/io/cloud/_utils.py +80 -0
  113. polars/io/cloud/credential_provider/__init__.py +17 -0
  114. polars/io/cloud/credential_provider/_builder.py +520 -0
  115. polars/io/cloud/credential_provider/_providers.py +618 -0
  116. polars/io/csv/__init__.py +9 -0
  117. polars/io/csv/_utils.py +38 -0
  118. polars/io/csv/batched_reader.py +142 -0
  119. polars/io/csv/functions.py +1495 -0
  120. polars/io/database/__init__.py +6 -0
  121. polars/io/database/_arrow_registry.py +70 -0
  122. polars/io/database/_cursor_proxies.py +147 -0
  123. polars/io/database/_executor.py +578 -0
  124. polars/io/database/_inference.py +314 -0
  125. polars/io/database/_utils.py +144 -0
  126. polars/io/database/functions.py +516 -0
  127. polars/io/delta.py +499 -0
  128. polars/io/iceberg/__init__.py +3 -0
  129. polars/io/iceberg/_utils.py +697 -0
  130. polars/io/iceberg/dataset.py +556 -0
  131. polars/io/iceberg/functions.py +151 -0
  132. polars/io/ipc/__init__.py +8 -0
  133. polars/io/ipc/functions.py +514 -0
  134. polars/io/json/__init__.py +3 -0
  135. polars/io/json/read.py +101 -0
  136. polars/io/ndjson.py +332 -0
  137. polars/io/parquet/__init__.py +17 -0
  138. polars/io/parquet/field_overwrites.py +140 -0
  139. polars/io/parquet/functions.py +722 -0
  140. polars/io/partition.py +491 -0
  141. polars/io/plugins.py +187 -0
  142. polars/io/pyarrow_dataset/__init__.py +5 -0
  143. polars/io/pyarrow_dataset/anonymous_scan.py +109 -0
  144. polars/io/pyarrow_dataset/functions.py +79 -0
  145. polars/io/scan_options/__init__.py +5 -0
  146. polars/io/scan_options/_options.py +59 -0
  147. polars/io/scan_options/cast_options.py +126 -0
  148. polars/io/spreadsheet/__init__.py +6 -0
  149. polars/io/spreadsheet/_utils.py +52 -0
  150. polars/io/spreadsheet/_write_utils.py +647 -0
  151. polars/io/spreadsheet/functions.py +1323 -0
  152. polars/lazyframe/__init__.py +9 -0
  153. polars/lazyframe/engine_config.py +61 -0
  154. polars/lazyframe/frame.py +8564 -0
  155. polars/lazyframe/group_by.py +669 -0
  156. polars/lazyframe/in_process.py +42 -0
  157. polars/lazyframe/opt_flags.py +333 -0
  158. polars/meta/__init__.py +14 -0
  159. polars/meta/build.py +33 -0
  160. polars/meta/index_type.py +27 -0
  161. polars/meta/thread_pool.py +50 -0
  162. polars/meta/versions.py +120 -0
  163. polars/ml/__init__.py +0 -0
  164. polars/ml/torch.py +213 -0
  165. polars/ml/utilities.py +30 -0
  166. polars/plugins.py +155 -0
  167. polars/py.typed +0 -0
  168. polars/pyproject.toml +96 -0
  169. polars/schema.py +265 -0
  170. polars/selectors.py +3117 -0
  171. polars/series/__init__.py +5 -0
  172. polars/series/array.py +776 -0
  173. polars/series/binary.py +254 -0
  174. polars/series/categorical.py +246 -0
  175. polars/series/datetime.py +2275 -0
  176. polars/series/list.py +1087 -0
  177. polars/series/plotting.py +191 -0
  178. polars/series/series.py +9197 -0
  179. polars/series/string.py +2367 -0
  180. polars/series/struct.py +154 -0
  181. polars/series/utils.py +191 -0
  182. polars/sql/__init__.py +7 -0
  183. polars/sql/context.py +677 -0
  184. polars/sql/functions.py +139 -0
  185. polars/string_cache.py +185 -0
  186. polars/testing/__init__.py +13 -0
  187. polars/testing/asserts/__init__.py +9 -0
  188. polars/testing/asserts/frame.py +231 -0
  189. polars/testing/asserts/series.py +219 -0
  190. polars/testing/asserts/utils.py +12 -0
  191. polars/testing/parametric/__init__.py +33 -0
  192. polars/testing/parametric/profiles.py +107 -0
  193. polars/testing/parametric/strategies/__init__.py +22 -0
  194. polars/testing/parametric/strategies/_utils.py +14 -0
  195. polars/testing/parametric/strategies/core.py +615 -0
  196. polars/testing/parametric/strategies/data.py +452 -0
  197. polars/testing/parametric/strategies/dtype.py +436 -0
  198. polars/testing/parametric/strategies/legacy.py +169 -0
  199. polars/type_aliases.py +24 -0
  200. polars_runtime_compat-1.34.0b2.dist-info/METADATA +31 -0
  201. polars_runtime_compat-1.34.0b2.dist-info/RECORD +203 -0
  202. polars_runtime_compat-1.34.0b2.dist-info/WHEEL +4 -0
  203. polars_runtime_compat-1.34.0b2.dist-info/licenses/LICENSE +1 -0
polars/io/partition.py ADDED
@@ -0,0 +1,491 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ from collections.abc import Iterable, Mapping, Sequence
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING
7
+
8
+ from polars import DataFrame, col
9
+ from polars._typing import PartitioningScheme
10
+ from polars._utils.unstable import issue_unstable_warning
11
+ from polars.expr import Expr
12
+
13
+ if TYPE_CHECKING:
14
+ with contextlib.suppress(ImportError): # Module not available when building docs
15
+ from polars._plr import PyDataFrame, PyExpr
16
+
17
+ from typing import IO, Any, Callable
18
+
19
+ with contextlib.suppress(ImportError): # Module not available when building docs
20
+ from polars._plr import PyPartitioning
21
+
22
+
23
+ class KeyedPartition:
24
+ """
25
+ A key-value pair for a partition.
26
+
27
+ .. warning::
28
+ This functionality is currently considered **unstable**. It may be
29
+ changed at any point without it being considered a breaking change.
30
+
31
+ See Also
32
+ --------
33
+ PartitionByKey
34
+ PartitionParted
35
+ KeyedPartitionContext
36
+ """
37
+
38
+ def __init__(self, name: str, str_value: str, raw_value: Any) -> None:
39
+ self.name = name
40
+ self.str_value = str_value
41
+ self.raw_value = raw_value
42
+
43
+ name: str #: Name of the key column.
44
+ str_value: str #: Value of the key as a path and URL safe string.
45
+ raw_value: Any #: Value of the key for this partition.
46
+
47
+ def hive_name(self) -> str:
48
+ """Get the `key=value`."""
49
+ return f"{self.name}={self.str_value}"
50
+
51
+
52
+ class KeyedPartitionContext:
53
+ """
54
+ Callback context for a partition creation using keys.
55
+
56
+ .. warning::
57
+ This functionality is currently considered **unstable**. It may be
58
+ changed at any point without it being considered a breaking change.
59
+
60
+ See Also
61
+ --------
62
+ PartitionByKey
63
+ PartitionParted
64
+ """
65
+
66
+ def __init__(
67
+ self,
68
+ file_idx: int,
69
+ part_idx: int,
70
+ in_part_idx: int,
71
+ keys: list[KeyedPartition],
72
+ file_path: Path,
73
+ full_path: Path,
74
+ ) -> None:
75
+ self.file_idx = file_idx
76
+ self.part_idx = part_idx
77
+ self.in_part_idx = in_part_idx
78
+ self.keys = keys
79
+ self.file_path = file_path
80
+ self.full_path = full_path
81
+
82
+ file_idx: int #: The index of the created file starting from zero.
83
+ part_idx: int #: The index of the created partition starting from zero.
84
+ in_part_idx: int #: The index of the file within this partition starting from zero.
85
+ keys: list[KeyedPartition] #: All the key names and values used for this partition.
86
+ file_path: Path #: The chosen output path before the callback was called without `base_path`.
87
+ full_path: (
88
+ Path #: The chosen output path before the callback was called with `base_path`.
89
+ )
90
+
91
+ def hive_dirs(self) -> Path:
92
+ """The keys mapped to hive directories."""
93
+ assert len(self.keys) > 0
94
+ p = Path(self.keys[0].hive_name())
95
+ for key in self.keys[1:]:
96
+ p /= Path(key.hive_name())
97
+ return p
98
+
99
+
100
+ class BasePartitionContext:
101
+ """
102
+ Callback context for a partition creation.
103
+
104
+ .. warning::
105
+ This functionality is currently considered **unstable**. It may be
106
+ changed at any point without it being considered a breaking change.
107
+
108
+ See Also
109
+ --------
110
+ PartitionMaxSize
111
+ """
112
+
113
+ def __init__(self, file_idx: int, file_path: Path, full_path: Path) -> None:
114
+ self.file_idx = file_idx
115
+ self.file_path = file_path
116
+ self.full_path = full_path
117
+
118
+ file_idx: int #: The index of the created file starting from zero.
119
+ file_path: Path #: The chosen output path before the callback was called without `base_path`.
120
+ full_path: (
121
+ Path #: The chosen output path before the callback was called with `base_path`.
122
+ )
123
+
124
+
125
+ def _cast_base_file_path_cb(
126
+ file_path_cb: Callable[[BasePartitionContext], Path | str | IO[bytes] | IO[str]]
127
+ | None,
128
+ ) -> Callable[[BasePartitionContext], Path | str | IO[bytes] | IO[str]] | None:
129
+ if file_path_cb is None:
130
+ return None
131
+ return lambda ctx: file_path_cb(
132
+ BasePartitionContext(
133
+ file_idx=ctx.file_idx,
134
+ file_path=Path(ctx.file_path),
135
+ full_path=Path(ctx.full_path),
136
+ )
137
+ )
138
+
139
+
140
+ def _cast_keyed_file_path_cb(
141
+ file_path_cb: Callable[[KeyedPartitionContext], Path | str | IO[bytes] | IO[str]]
142
+ | None,
143
+ ) -> Callable[[KeyedPartitionContext], Path | str | IO[bytes] | IO[str]] | None:
144
+ if file_path_cb is None:
145
+ return None
146
+ return lambda ctx: file_path_cb(
147
+ KeyedPartitionContext(
148
+ file_idx=ctx.file_idx,
149
+ part_idx=ctx.part_idx,
150
+ in_part_idx=ctx.in_part_idx,
151
+ keys=[
152
+ KeyedPartition(
153
+ name=kv.name, str_value=kv.str_value, raw_value=kv.raw_value
154
+ )
155
+ for kv in ctx.keys
156
+ ],
157
+ file_path=Path(ctx.file_path),
158
+ full_path=Path(ctx.full_path),
159
+ )
160
+ )
161
+
162
+
163
+ def _prepare_per_partition_sort_by(
164
+ e: str | Expr | Iterable[str | Expr] | None,
165
+ ) -> list[PyExpr] | None:
166
+ def prepare_one(v: str | Expr) -> PyExpr:
167
+ if isinstance(v, str):
168
+ return col(v)._pyexpr
169
+ elif isinstance(v, Expr):
170
+ return v._pyexpr
171
+ else:
172
+ msg = f"cannot do a per partition sort by for {v!r}"
173
+ raise TypeError(msg)
174
+
175
+ if e is None:
176
+ return None
177
+ elif isinstance(e, str):
178
+ return [col(e)._pyexpr]
179
+ elif isinstance(e, Expr):
180
+ return [e._pyexpr]
181
+ elif isinstance(e, Iterable):
182
+ return [prepare_one(v) for v in e]
183
+ else:
184
+ msg = f"cannot do a per partition sort by for {e!r}"
185
+ raise TypeError(msg)
186
+
187
+
188
+ def _prepare_finish_callback(
189
+ f: Callable[[DataFrame], None] | None,
190
+ ) -> Callable[[PyDataFrame], None] | None:
191
+ if f is None:
192
+ return None
193
+
194
+ def cb(pydf: PyDataFrame) -> None:
195
+ nonlocal f
196
+ f(DataFrame._from_pydf(pydf))
197
+
198
+ return cb
199
+
200
+
201
+ class PartitionMaxSize(PartitioningScheme):
202
+ """
203
+ Partitioning scheme to write files with a maximum size.
204
+
205
+ This partitioning scheme generates files that have a given maximum size. If
206
+ a file reaches the maximum size, it is closed and a new one is opened.
207
+
208
+ .. warning::
209
+ This functionality is currently considered **unstable**. It may be
210
+ changed at any point without it being considered a breaking change.
211
+
212
+ Parameters
213
+ ----------
214
+ base_path
215
+ The base path for the output files.
216
+ file_path
217
+ A callback to register or modify the output path for each partition
218
+ relative to the `base_path`. The callback provides a
219
+ :class:`polars.io.partition.BasePartitionContext` that contains information
220
+ about the partition.
221
+
222
+ If no callback is given, it defaults to `{ctx.file_idx}.{EXT}`.
223
+ max_size : int
224
+ The maximum size in rows of each of the generated files.
225
+ per_partition_sort_by
226
+ Columns or expressions to sort over within each partition.
227
+
228
+ Note that this might increase the memory consumption needed for each partition.
229
+ finish_callback
230
+ A callback that gets called when the query finishes successfully.
231
+
232
+ For parquet files, the callback is given a dataframe with metrics about all
233
+ written files.
234
+
235
+ Examples
236
+ --------
237
+ Split a parquet file into smaller CSV files with 100,000 rows each:
238
+
239
+ >>> pl.scan_parquet("/path/to/file.parquet").sink_csv(
240
+ ... PartitionMaxSize("./out", max_size=100_000),
241
+ ... ) # doctest: +SKIP
242
+
243
+ See Also
244
+ --------
245
+ PartitionByKey
246
+ PartitionParted
247
+ polars.io.partition.BasePartitionContext
248
+ """
249
+
250
+ def __init__(
251
+ self,
252
+ base_path: str | Path,
253
+ *,
254
+ file_path: Callable[[BasePartitionContext], Path | str | IO[bytes] | IO[str]]
255
+ | None = None,
256
+ max_size: int,
257
+ per_partition_sort_by: str | Expr | Iterable[str | Expr] | None = None,
258
+ finish_callback: Callable[[DataFrame], None] | None = None,
259
+ ) -> None:
260
+ issue_unstable_warning("partitioning strategies are considered unstable.")
261
+ super().__init__(
262
+ PyPartitioning.new_max_size(
263
+ base_path=base_path,
264
+ file_path_cb=_cast_base_file_path_cb(file_path),
265
+ max_size=max_size,
266
+ per_partition_sort_by=_prepare_per_partition_sort_by(
267
+ per_partition_sort_by
268
+ ),
269
+ finish_callback=_prepare_finish_callback(finish_callback),
270
+ )
271
+ )
272
+
273
+
274
+ def _lower_by(
275
+ by: str | Expr | Sequence[str | Expr] | Mapping[str, Expr],
276
+ ) -> list[PyExpr]:
277
+ def to_expr(i: str | Expr) -> Expr:
278
+ if isinstance(i, str):
279
+ return col(i)
280
+ else:
281
+ return i
282
+
283
+ lowered_by: list[PyExpr]
284
+ if isinstance(by, str):
285
+ lowered_by = [col(by)._pyexpr]
286
+ elif isinstance(by, Expr):
287
+ lowered_by = [by._pyexpr]
288
+ elif isinstance(by, Sequence):
289
+ lowered_by = [to_expr(e)._pyexpr for e in by]
290
+ elif isinstance(by, Mapping):
291
+ lowered_by = [e.alias(n)._pyexpr for n, e in by.items()]
292
+ else:
293
+ msg = "invalid `by` type"
294
+ raise TypeError(msg)
295
+
296
+ return lowered_by
297
+
298
+
299
+ class PartitionByKey(PartitioningScheme):
300
+ """
301
+ Partitioning scheme to write files split by the values of keys.
302
+
303
+ This partitioning scheme generates an arbitrary number of files, splitting
304
+ the data according to the values of the key expressions.
305
+
306
+ The number of files that can be written is not limited. However, when
307
+ writing beyond a certain number of files, the data for the remaining
308
+ partitions is buffered before being written to the file.
309
+
310
+ .. warning::
311
+ This functionality is currently considered **unstable**. It may be
312
+ changed at any point without it being considered a breaking change.
313
+
314
+ Parameters
315
+ ----------
316
+ base_path
317
+ The base path for the output files.
318
+
319
+ Use the `mkdir` option on the `sink_*` methods to ensure directories in
320
+ the path are created.
321
+ file_path
322
+ A callback to register or modify the output path for each partition
323
+ relative to the `base_path`. The callback provides a
324
+ :class:`polars.io.partition.KeyedPartitionContext` that contains information
325
+ about the partition.
326
+
327
+ If no callback is given, it defaults to
328
+ `{ctx.keys.hive_dirs()}/{ctx.in_part_idx}.{EXT}`.
329
+ by
330
+ The expressions to partition by.
331
+ include_key : bool
332
+ Whether to include the key columns in the output files.
333
+ per_partition_sort_by
334
+ Columns or expressions to sort over within each partition.
335
+
336
+ Note that this might increase the memory consumption needed for each partition.
337
+ finish_callback
338
+ A callback that gets called when the query finishes successfully.
339
+
340
+ For parquet files, the callback is given a dataframe with metrics about all
341
+ written files.
342
+
343
+ Examples
344
+ --------
345
+ Split into a hive-partitioning style partition:
346
+
347
+ >>> (
348
+ ... pl.DataFrame({"a": [1, 2, 3], "b": [5, 7, 9], "c": ["A", "B", "C"]})
349
+ ... .lazy()
350
+ ... .sink_parquet(
351
+ ... PartitionByKey(
352
+ ... "./out",
353
+ ... by=[pl.col.a, pl.col.b],
354
+ ... include_key=False,
355
+ ... ),
356
+ ... mkdir=True,
357
+ ... )
358
+ ... ) # doctest: +SKIP
359
+
360
+ Split a parquet file by a column `year` into CSV files:
361
+
362
+ >>> pl.scan_parquet("/path/to/file.parquet").sink_csv(
363
+ ... PartitionByKey(
364
+ ... "./out/",
365
+ ... file_path=lambda ctx: f"year={ctx.keys[0].str_value}.csv",
366
+ ... by="year",
367
+ ... ),
368
+ ... ) # doctest: +SKIP
369
+
370
+ See Also
371
+ --------
372
+ PartitionMaxSize
373
+ PartitionParted
374
+ polars.io.partition.KeyedPartitionContext
375
+ """
376
+
377
+ def __init__(
378
+ self,
379
+ base_path: str | Path,
380
+ *,
381
+ file_path: Callable[[KeyedPartitionContext], Path | str | IO[bytes] | IO[str]]
382
+ | None = None,
383
+ by: str | Expr | Sequence[str | Expr] | Mapping[str, Expr],
384
+ include_key: bool = True,
385
+ per_partition_sort_by: str | Expr | Iterable[str | Expr] | None = None,
386
+ finish_callback: Callable[[DataFrame], None] | None = None,
387
+ ) -> None:
388
+ issue_unstable_warning("partitioning strategies are considered unstable.")
389
+
390
+ lowered_by = _lower_by(by)
391
+ super().__init__(
392
+ PyPartitioning.new_by_key(
393
+ base_path=base_path,
394
+ file_path_cb=_cast_keyed_file_path_cb(file_path),
395
+ by=lowered_by,
396
+ include_key=include_key,
397
+ per_partition_sort_by=_prepare_per_partition_sort_by(
398
+ per_partition_sort_by
399
+ ),
400
+ finish_callback=_prepare_finish_callback(finish_callback),
401
+ )
402
+ )
403
+
404
+
405
+ class PartitionParted(PartitioningScheme):
406
+ """
407
+ Partitioning scheme to split parted dataframes.
408
+
409
+ This is a specialized version of :class:`PartitionByKey`. Whereas
410
+ :class:`PartitionByKey` accepts data in any order, this scheme expects the input
411
+ data to be pre-grouped or pre-sorted. This scheme incurs far less overhead than
412
+ :class:`PartitionByKey`, but may not always be applicable.
413
+
414
+ Each new value of the key expressions starts a new partition; therefore, repeating
415
+ the same value multiple times may overwrite previous partitions.
416
+
417
+ .. warning::
418
+ This functionality is currently considered **unstable**. It may be
419
+ changed at any point without it being considered a breaking change.
420
+
421
+ Parameters
422
+ ----------
423
+ base_path
424
+ The base path for the output files.
425
+
426
+ Use the `mkdir` option on the `sink_*` methods to ensure directories in
427
+ the path are created.
428
+ file_path
429
+ A callback to register or modify the output path for each partition
430
+ relative to the `base_path`. The callback provides a
431
+ :class:`polars.io.partition.KeyedPartitionContext` that contains information
432
+ about the partition.
433
+
434
+ If no callback is given, it defaults to
435
+ `{ctx.keys.hive_dirs()}/{ctx.in_part_idx}.{EXT}`.
436
+ by
437
+ The expressions to partition by.
438
+ include_key : bool
439
+ Whether to include the key columns in the output files.
440
+ per_partition_sort_by
441
+ Columns or expressions to sort over within each partition.
442
+
443
+ Note that this might increase the memory consumption needed for each partition.
444
+ finish_callback
445
+ A callback that gets called when the query finishes successfully.
446
+
447
+ For parquet files, the callback is given a dataframe with metrics about all
448
+ written files.
449
+
450
+ Examples
451
+ --------
452
+ Split a parquet file by a column `year` into CSV files:
453
+
454
+ >>> pl.scan_parquet("/path/to/file.parquet").sink_csv(
455
+ ... PartitionParted("./out", by="year"),
456
+ ... mkdir=True,
457
+ ... ) # doctest: +SKIP
458
+
459
+ See Also
460
+ --------
461
+ PartitionMaxSize
462
+ PartitionByKey
463
+ polars.io.partition.KeyedPartitionContext
464
+ """
465
+
466
+ def __init__(
467
+ self,
468
+ base_path: str | Path,
469
+ *,
470
+ file_path: Callable[[KeyedPartitionContext], Path | str | IO[bytes] | IO[str]]
471
+ | None = None,
472
+ by: str | Expr | Sequence[str | Expr] | Mapping[str, Expr],
473
+ include_key: bool = True,
474
+ per_partition_sort_by: str | Expr | Iterable[str | Expr] | None = None,
475
+ finish_callback: Callable[[DataFrame], None] | None = None,
476
+ ) -> None:
477
+ issue_unstable_warning("partitioning strategies are considered unstable.")
478
+
479
+ lowered_by = _lower_by(by)
480
+ super().__init__(
481
+ PyPartitioning.new_by_key(
482
+ base_path=base_path,
483
+ file_path_cb=_cast_keyed_file_path_cb(file_path),
484
+ by=lowered_by,
485
+ include_key=include_key,
486
+ per_partition_sort_by=_prepare_per_partition_sort_by(
487
+ per_partition_sort_by
488
+ ),
489
+ finish_callback=_prepare_finish_callback(finish_callback),
490
+ )
491
+ )
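
The partition schemes above are the targets passed to the `sink_*` methods. Below is a minimal usage sketch based only on the docstrings in this file; the input path and the key columns (`year`, `month`) are illustrative assumptions, not part of the release.

import polars as pl
from polars.io.partition import KeyedPartitionContext, PartitionByKey, PartitionMaxSize

lf = pl.scan_parquet("/path/to/file.parquet")  # hypothetical input dataset

# Cap every output file at 100_000 rows.
lf.sink_csv(PartitionMaxSize("./out", max_size=100_000), mkdir=True)

# Split by key columns and derive each file name from the partition context.
def name_file(ctx: KeyedPartitionContext) -> str:
    # e.g. "year=2024/month=7/0.parquet", using the hive-style helper defined above
    return str(ctx.hive_dirs() / f"{ctx.in_part_idx}.parquet")

lf.sink_parquet(
    PartitionByKey("./out", by=["year", "month"], file_path=name_file, include_key=False),
    mkdir=True,
)
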
polars/io/plugins.py ADDED
@@ -0,0 +1,187 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ from collections.abc import Iterator
6
+ from typing import TYPE_CHECKING, Callable
7
+
8
+ import polars._reexport as pl
9
+ from polars._utils.unstable import unstable
10
+
11
+ if TYPE_CHECKING:
12
+ from collections.abc import Iterator
13
+ from typing import Callable
14
+
15
+ from polars import DataFrame, Expr, LazyFrame
16
+ from polars._typing import SchemaDict
17
+
18
+
19
+ @unstable()
20
+ def register_io_source(
21
+ io_source: Callable[
22
+ [list[str] | None, Expr | None, int | None, int | None], Iterator[DataFrame]
23
+ ],
24
+ *,
25
+ schema: Callable[[], SchemaDict] | SchemaDict,
26
+ validate_schema: bool = False,
27
+ is_pure: bool = False,
28
+ ) -> LazyFrame:
29
+ """
30
+ Register your IO plugin and initialize a LazyFrame.
31
+
32
+ See the `user guide <https://docs.pola.rs/user-guide/plugins/io_plugins>`_
33
+ for more information about plugins.
34
+
35
+ .. warning::
36
+ This functionality is considered **unstable**. It may be changed
37
+ at any point without it being considered a breaking change.
38
+
39
+
40
+ Parameters
41
+ ----------
42
+ io_source
43
+ Function that accepts the following arguments:
44
+ with_columns
45
+ Columns that are projected. The reader must
46
+ project these columns if provided.
47
+ predicate
48
+ Polars expression. The reader must filter
49
+ the rows accordingly.
50
+ n_rows
51
+ Materialize only n rows from the source.
52
+ The reader can stop when `n_rows` are read.
53
+ batch_size
54
+ A hint of the ideal batch size the reader's
55
+ generator should produce.
56
+
57
+ The function should return an iterator/generator
58
+ that produces DataFrames.
59
+ schema
60
+ Schema or function that when called produces the schema that the reader
61
+ will produce before projection pushdown.
62
+ validate_schema
63
+ Whether the engine should validate if the batches generated match
64
+ the given schema. It's an implementation error if this isn't
65
+ the case and can lead to bugs that are hard to solve.
66
+ is_pure
67
+ Whether the IO source is pure. Repeated occurrences of the same IO source in
68
+ a LazyFrame plan can be de-duplicated during optimization if they are
69
+ pure.
70
+
71
+ Returns
72
+ -------
73
+ LazyFrame
74
+ """
75
+
76
+ def wrap(
77
+ with_columns: list[str] | None,
78
+ predicate: bytes | None,
79
+ n_rows: int | None,
80
+ batch_size: int | None,
81
+ ) -> tuple[Iterator[DataFrame], bool]:
82
+ parsed_predicate_success = True
83
+ parsed_predicate = None
84
+ if predicate:
85
+ try:
86
+ parsed_predicate = pl.Expr.deserialize(predicate)
87
+ except Exception as e:
88
+ if os.environ.get("POLARS_VERBOSE"):
89
+ print(
90
+ f"failed parsing IO plugin expression\n\nfilter will be handled on Polars' side: {e}",
91
+ file=sys.stderr,
92
+ )
93
+ parsed_predicate_success = False
94
+
95
+ return io_source(
96
+ with_columns, parsed_predicate, n_rows, batch_size
97
+ ), parsed_predicate_success
98
+
99
+ return pl.LazyFrame._scan_python_function(
100
+ schema=schema,
101
+ scan_fn=wrap,
102
+ pyarrow=False,
103
+ validate_schema=validate_schema,
104
+ is_pure=is_pure,
105
+ )
106
+
107
+
108
+ @unstable()
109
+ def _defer(
110
+ function: Callable[[], DataFrame],
111
+ *,
112
+ schema: SchemaDict | Callable[[], SchemaDict],
113
+ validate_schema: bool = True,
114
+ ) -> LazyFrame:
115
+ """
116
+ Deferred execution.
117
+
118
+ Takes a function that produces a `DataFrame` but defers execution until the
119
+ `LazyFrame` is collected.
120
+
121
+ Parameters
122
+ ----------
123
+ function
124
+ Function that takes no arguments and produces a `DataFrame`.
125
+ schema
126
+ Schema of the `DataFrame` the deferred function will return.
127
+ The caller must ensure this schema is correct.
128
+ validate_schema
129
+ Whether the engine should validate if the batches generated match
130
+ the given schema. It's an implementation error if this isn't
131
+ the case and can lead to bugs that are hard to solve.
132
+
133
+ Examples
134
+ --------
135
+ Delay DataFrame execution until the query is executed.
136
+
137
+ >>> import numpy as np
138
+ >>> np.random.seed(0)
139
+ >>> lf = pl.defer(
140
+ ... lambda: pl.DataFrame({"a": np.random.randn(3)}), schema={"a": pl.Float64}
141
+ ... )
142
+ >>> lf.collect()
143
+ shape: (3, 1)
144
+ ┌──────────┐
145
+ │ a │
146
+ │ --- │
147
+ │ f64 │
148
+ ╞══════════╡
149
+ │ 1.764052 │
150
+ │ 0.400157 │
151
+ │ 0.978738 │
152
+ └──────────┘
153
+
154
+ Run an eager source in Polars Cloud
155
+
156
+ >>> (
157
+ ... pl.defer(
158
+ ... lambda: pl.read_database("select * from tbl"),
159
+ ... schema={"a": pl.Float64, "b": pl.Boolean},
160
+ ... )
161
+ ... .filter("b")
162
+ ... .sum("a")
163
+ ... .remote()
164
+ ... .collect()
165
+ ... ) # doctest: +SKIP
166
+
167
+
168
+ """
169
+
170
+ def source(
171
+ with_columns: list[str] | None,
172
+ predicate: Expr | None,
173
+ n_rows: int | None,
174
+ batch_size: int | None,
175
+ ) -> Iterator[DataFrame]:
176
+ lf = function().lazy()
177
+ if with_columns is not None:
178
+ lf = lf.select(with_columns)
179
+ if predicate is not None:
180
+ lf = lf.filter(predicate)
181
+ if n_rows is not None:
182
+ lf = lf.limit(n_rows)
183
+ yield lf.collect()
184
+
185
+ return register_io_source(
186
+ io_source=source, schema=schema, validate_schema=validate_schema
187
+ )
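
For context, `register_io_source` wires a Python generator into the lazy engine. The sketch below exercises the documented callback signature against a small in-memory stand-in source; the data and schema are made up for illustration.

from __future__ import annotations

from collections.abc import Iterator

import polars as pl
from polars.io.plugins import register_io_source

DATA = {"a": [1, 2, 3], "b": ["x", "y", "z"]}  # stand-in for a real external source

def my_source(
    with_columns: list[str] | None,
    predicate: pl.Expr | None,
    n_rows: int | None,
    batch_size: int | None,
) -> Iterator[pl.DataFrame]:
    df = pl.DataFrame(DATA)
    if with_columns is not None:
        df = df.select(with_columns)  # honour projection pushdown
    if predicate is not None:
        df = df.filter(predicate)  # honour predicate pushdown
    if n_rows is not None:
        df = df.head(n_rows)
    yield df  # a real reader would yield multiple batches of roughly batch_size rows

lf = register_io_source(my_source, schema={"a": pl.Int64, "b": pl.String})
print(lf.filter(pl.col("a") > 1).collect())
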
polars/io/pyarrow_dataset/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from polars.io.pyarrow_dataset.functions import scan_pyarrow_dataset
2
+
3
+ __all__ = [
4
+ "scan_pyarrow_dataset",
5
+ ]
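
This last file only re-exports `scan_pyarrow_dataset`. A brief usage sketch follows; the dataset path and column name are hypothetical.

import pyarrow.dataset as ds
import polars as pl

dset = ds.dataset("data/", format="parquet")  # hypothetical local dataset
lf = pl.scan_pyarrow_dataset(dset)  # pushes projections/predicates into PyArrow where possible
print(lf.filter(pl.col("a") > 0).collect())
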