polars-runtime-compat 1.34.0b3__cp39-abi3-win_arm64.whl → 1.34.0b5__cp39-abi3-win_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of polars-runtime-compat might be problematic.
- _polars_runtime_compat/_polars_runtime_compat.pyd +0 -0
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/METADATA +6 -2
- polars_runtime_compat-1.34.0b5.dist-info/RECORD +6 -0
- polars/__init__.py +0 -528
- polars/_cpu_check.py +0 -265
- polars/_dependencies.py +0 -355
- polars/_plr.py +0 -99
- polars/_plr.pyi +0 -2496
- polars/_reexport.py +0 -23
- polars/_typing.py +0 -478
- polars/_utils/__init__.py +0 -37
- polars/_utils/async_.py +0 -102
- polars/_utils/cache.py +0 -176
- polars/_utils/cloud.py +0 -40
- polars/_utils/constants.py +0 -29
- polars/_utils/construction/__init__.py +0 -46
- polars/_utils/construction/dataframe.py +0 -1397
- polars/_utils/construction/other.py +0 -72
- polars/_utils/construction/series.py +0 -560
- polars/_utils/construction/utils.py +0 -118
- polars/_utils/convert.py +0 -224
- polars/_utils/deprecation.py +0 -406
- polars/_utils/getitem.py +0 -457
- polars/_utils/logging.py +0 -11
- polars/_utils/nest_asyncio.py +0 -264
- polars/_utils/parquet.py +0 -15
- polars/_utils/parse/__init__.py +0 -12
- polars/_utils/parse/expr.py +0 -242
- polars/_utils/polars_version.py +0 -19
- polars/_utils/pycapsule.py +0 -53
- polars/_utils/scan.py +0 -27
- polars/_utils/serde.py +0 -63
- polars/_utils/slice.py +0 -215
- polars/_utils/udfs.py +0 -1251
- polars/_utils/unstable.py +0 -63
- polars/_utils/various.py +0 -782
- polars/_utils/wrap.py +0 -25
- polars/api.py +0 -370
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +0 -19
- polars/catalog/unity/client.py +0 -733
- polars/catalog/unity/models.py +0 -152
- polars/config.py +0 -1571
- polars/convert/__init__.py +0 -25
- polars/convert/general.py +0 -1046
- polars/convert/normalize.py +0 -261
- polars/dataframe/__init__.py +0 -5
- polars/dataframe/_html.py +0 -186
- polars/dataframe/frame.py +0 -12582
- polars/dataframe/group_by.py +0 -1067
- polars/dataframe/plotting.py +0 -257
- polars/datatype_expr/__init__.py +0 -5
- polars/datatype_expr/array.py +0 -56
- polars/datatype_expr/datatype_expr.py +0 -304
- polars/datatype_expr/list.py +0 -18
- polars/datatype_expr/struct.py +0 -69
- polars/datatypes/__init__.py +0 -122
- polars/datatypes/_parse.py +0 -195
- polars/datatypes/_utils.py +0 -48
- polars/datatypes/classes.py +0 -1213
- polars/datatypes/constants.py +0 -11
- polars/datatypes/constructor.py +0 -172
- polars/datatypes/convert.py +0 -366
- polars/datatypes/group.py +0 -130
- polars/exceptions.py +0 -230
- polars/expr/__init__.py +0 -7
- polars/expr/array.py +0 -964
- polars/expr/binary.py +0 -346
- polars/expr/categorical.py +0 -306
- polars/expr/datetime.py +0 -2620
- polars/expr/expr.py +0 -11272
- polars/expr/list.py +0 -1408
- polars/expr/meta.py +0 -444
- polars/expr/name.py +0 -321
- polars/expr/string.py +0 -3045
- polars/expr/struct.py +0 -357
- polars/expr/whenthen.py +0 -185
- polars/functions/__init__.py +0 -193
- polars/functions/aggregation/__init__.py +0 -33
- polars/functions/aggregation/horizontal.py +0 -298
- polars/functions/aggregation/vertical.py +0 -341
- polars/functions/as_datatype.py +0 -848
- polars/functions/business.py +0 -138
- polars/functions/col.py +0 -384
- polars/functions/datatype.py +0 -121
- polars/functions/eager.py +0 -524
- polars/functions/escape_regex.py +0 -29
- polars/functions/lazy.py +0 -2751
- polars/functions/len.py +0 -68
- polars/functions/lit.py +0 -210
- polars/functions/random.py +0 -22
- polars/functions/range/__init__.py +0 -19
- polars/functions/range/_utils.py +0 -15
- polars/functions/range/date_range.py +0 -303
- polars/functions/range/datetime_range.py +0 -370
- polars/functions/range/int_range.py +0 -348
- polars/functions/range/linear_space.py +0 -311
- polars/functions/range/time_range.py +0 -287
- polars/functions/repeat.py +0 -301
- polars/functions/whenthen.py +0 -353
- polars/interchange/__init__.py +0 -10
- polars/interchange/buffer.py +0 -77
- polars/interchange/column.py +0 -190
- polars/interchange/dataframe.py +0 -230
- polars/interchange/from_dataframe.py +0 -328
- polars/interchange/protocol.py +0 -303
- polars/interchange/utils.py +0 -170
- polars/io/__init__.py +0 -64
- polars/io/_utils.py +0 -317
- polars/io/avro.py +0 -49
- polars/io/clipboard.py +0 -36
- polars/io/cloud/__init__.py +0 -17
- polars/io/cloud/_utils.py +0 -80
- polars/io/cloud/credential_provider/__init__.py +0 -17
- polars/io/cloud/credential_provider/_builder.py +0 -520
- polars/io/cloud/credential_provider/_providers.py +0 -618
- polars/io/csv/__init__.py +0 -9
- polars/io/csv/_utils.py +0 -38
- polars/io/csv/batched_reader.py +0 -142
- polars/io/csv/functions.py +0 -1495
- polars/io/database/__init__.py +0 -6
- polars/io/database/_arrow_registry.py +0 -70
- polars/io/database/_cursor_proxies.py +0 -147
- polars/io/database/_executor.py +0 -578
- polars/io/database/_inference.py +0 -314
- polars/io/database/_utils.py +0 -144
- polars/io/database/functions.py +0 -516
- polars/io/delta.py +0 -499
- polars/io/iceberg/__init__.py +0 -3
- polars/io/iceberg/_utils.py +0 -697
- polars/io/iceberg/dataset.py +0 -556
- polars/io/iceberg/functions.py +0 -151
- polars/io/ipc/__init__.py +0 -8
- polars/io/ipc/functions.py +0 -514
- polars/io/json/__init__.py +0 -3
- polars/io/json/read.py +0 -101
- polars/io/ndjson.py +0 -332
- polars/io/parquet/__init__.py +0 -17
- polars/io/parquet/field_overwrites.py +0 -140
- polars/io/parquet/functions.py +0 -722
- polars/io/partition.py +0 -491
- polars/io/plugins.py +0 -187
- polars/io/pyarrow_dataset/__init__.py +0 -5
- polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
- polars/io/pyarrow_dataset/functions.py +0 -79
- polars/io/scan_options/__init__.py +0 -5
- polars/io/scan_options/_options.py +0 -59
- polars/io/scan_options/cast_options.py +0 -126
- polars/io/spreadsheet/__init__.py +0 -6
- polars/io/spreadsheet/_utils.py +0 -52
- polars/io/spreadsheet/_write_utils.py +0 -647
- polars/io/spreadsheet/functions.py +0 -1323
- polars/lazyframe/__init__.py +0 -9
- polars/lazyframe/engine_config.py +0 -61
- polars/lazyframe/frame.py +0 -8564
- polars/lazyframe/group_by.py +0 -669
- polars/lazyframe/in_process.py +0 -42
- polars/lazyframe/opt_flags.py +0 -333
- polars/meta/__init__.py +0 -14
- polars/meta/build.py +0 -33
- polars/meta/index_type.py +0 -27
- polars/meta/thread_pool.py +0 -50
- polars/meta/versions.py +0 -120
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +0 -213
- polars/ml/utilities.py +0 -30
- polars/plugins.py +0 -155
- polars/py.typed +0 -0
- polars/pyproject.toml +0 -103
- polars/schema.py +0 -265
- polars/selectors.py +0 -3117
- polars/series/__init__.py +0 -5
- polars/series/array.py +0 -776
- polars/series/binary.py +0 -254
- polars/series/categorical.py +0 -246
- polars/series/datetime.py +0 -2275
- polars/series/list.py +0 -1087
- polars/series/plotting.py +0 -191
- polars/series/series.py +0 -9197
- polars/series/string.py +0 -2367
- polars/series/struct.py +0 -154
- polars/series/utils.py +0 -191
- polars/sql/__init__.py +0 -7
- polars/sql/context.py +0 -677
- polars/sql/functions.py +0 -139
- polars/string_cache.py +0 -185
- polars/testing/__init__.py +0 -13
- polars/testing/asserts/__init__.py +0 -9
- polars/testing/asserts/frame.py +0 -231
- polars/testing/asserts/series.py +0 -219
- polars/testing/asserts/utils.py +0 -12
- polars/testing/parametric/__init__.py +0 -33
- polars/testing/parametric/profiles.py +0 -107
- polars/testing/parametric/strategies/__init__.py +0 -22
- polars/testing/parametric/strategies/_utils.py +0 -14
- polars/testing/parametric/strategies/core.py +0 -615
- polars/testing/parametric/strategies/data.py +0 -452
- polars/testing/parametric/strategies/dtype.py +0 -436
- polars/testing/parametric/strategies/legacy.py +0 -169
- polars/type_aliases.py +0 -24
- polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/WHEEL +0 -0
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/licenses/LICENSE +0 -0
polars/io/partition.py
DELETED
@@ -1,491 +0,0 @@
-from __future__ import annotations
-
-import contextlib
-from collections.abc import Iterable, Mapping, Sequence
-from pathlib import Path
-from typing import TYPE_CHECKING
-
-from polars import DataFrame, col
-from polars._typing import PartitioningScheme
-from polars._utils.unstable import issue_unstable_warning
-from polars.expr import Expr
-
-if TYPE_CHECKING:
-    with contextlib.suppress(ImportError):  # Module not available when building docs
-        from polars._plr import PyDataFrame, PyExpr
-
-    from typing import IO, Any, Callable
-
-with contextlib.suppress(ImportError):  # Module not available when building docs
-    from polars._plr import PyPartitioning
-
-
-class KeyedPartition:
-    """
-    A key-value pair for a partition.
-
-    .. warning::
-        This functionality is currently considered **unstable**. It may be
-        changed at any point without it being considered a breaking change.
-
-    See Also
-    --------
-    PartitionByKey
-    PartitionParted
-    KeyedPartitionContext
-    """
-
-    def __init__(self, name: str, str_value: str, raw_value: Any) -> None:
-        self.name = name
-        self.str_value = str_value
-        self.raw_value = raw_value
-
-    name: str  #: Name of the key column.
-    str_value: str  #: Value of the key as a path and URL safe string.
-    raw_value: Any  #: Value of the key for this partition.
-
-    def hive_name(self) -> str:
-        """Get the `key=value`."""
-        return f"{self.name}={self.str_value}"
-
-
-class KeyedPartitionContext:
-    """
-    Callback context for a partition creation using keys.
-
-    .. warning::
-        This functionality is currently considered **unstable**. It may be
-        changed at any point without it being considered a breaking change.
-
-    See Also
-    --------
-    PartitionByKey
-    PartitionParted
-    """
-
-    def __init__(
-        self,
-        file_idx: int,
-        part_idx: int,
-        in_part_idx: int,
-        keys: list[KeyedPartition],
-        file_path: Path,
-        full_path: Path,
-    ) -> None:
-        self.file_idx = file_idx
-        self.part_idx = part_idx
-        self.in_part_idx = in_part_idx
-        self.keys = keys
-        self.file_path = file_path
-        self.full_path = full_path
-
-    file_idx: int  #: The index of the created file starting from zero.
-    part_idx: int  #: The index of the created partition starting from zero.
-    in_part_idx: int  #: The index of the file within this partition starting from zero.
-    keys: list[KeyedPartition]  #: All the key names and values used for this partition.
-    file_path: Path  #: The chosen output path before the callback was called without `base_path`.
-    full_path: (
-        Path  #: The chosen output path before the callback was called with `base_path`.
-    )
-
-    def hive_dirs(self) -> Path:
-        """The keys mapped to hive directories."""
-        assert len(self.keys) > 0
-        p = Path(self.keys[0].hive_name())
-        for key in self.keys[1:]:
-            p /= Path(key.hive_name())
-        return p
-
-
-class BasePartitionContext:
-    """
-    Callback context for a partition creation.
-
-    .. warning::
-        This functionality is currently considered **unstable**. It may be
-        changed at any point without it being considered a breaking change.
-
-    See Also
-    --------
-    PartitionMaxSize
-    """
-
-    def __init__(self, file_idx: int, file_path: Path, full_path: Path) -> None:
-        self.file_idx = file_idx
-        self.file_path = file_path
-        self.full_path = full_path
-
-    file_idx: int  #: The index of the created file starting from zero.
-    file_path: Path  #: The chosen output path before the callback was called without `base_path`.
-    full_path: (
-        Path  #: The chosen output path before the callback was called with `base_path`.
-    )
-
-
-def _cast_base_file_path_cb(
-    file_path_cb: Callable[[BasePartitionContext], Path | str | IO[bytes] | IO[str]]
-    | None,
-) -> Callable[[BasePartitionContext], Path | str | IO[bytes] | IO[str]] | None:
-    if file_path_cb is None:
-        return None
-    return lambda ctx: file_path_cb(
-        BasePartitionContext(
-            file_idx=ctx.file_idx,
-            file_path=Path(ctx.file_path),
-            full_path=Path(ctx.full_path),
-        )
-    )
-
-
-def _cast_keyed_file_path_cb(
-    file_path_cb: Callable[[KeyedPartitionContext], Path | str | IO[bytes] | IO[str]]
-    | None,
-) -> Callable[[KeyedPartitionContext], Path | str | IO[bytes] | IO[str]] | None:
-    if file_path_cb is None:
-        return None
-    return lambda ctx: file_path_cb(
-        KeyedPartitionContext(
-            file_idx=ctx.file_idx,
-            part_idx=ctx.part_idx,
-            in_part_idx=ctx.in_part_idx,
-            keys=[
-                KeyedPartition(
-                    name=kv.name, str_value=kv.str_value, raw_value=kv.raw_value
-                )
-                for kv in ctx.keys
-            ],
-            file_path=Path(ctx.file_path),
-            full_path=Path(ctx.full_path),
-        )
-    )
-
-
-def _prepare_per_partition_sort_by(
-    e: str | Expr | Iterable[str | Expr] | None,
-) -> list[PyExpr] | None:
-    def prepare_one(v: str | Expr) -> PyExpr:
-        if isinstance(v, str):
-            return col(v)._pyexpr
-        elif isinstance(v, Expr):
-            return v._pyexpr
-        else:
-            msg = f"cannot do a per partition sort by for {v!r}"
-            raise TypeError(msg)
-
-    if e is None:
-        return None
-    elif isinstance(e, str):
-        return [col(e)._pyexpr]
-    elif isinstance(e, Expr):
-        return [e._pyexpr]
-    elif isinstance(e, Iterable):
-        return [prepare_one(v) for v in e]
-    else:
-        msg = f"cannot do a per partition sort by for {e!r}"
-        raise TypeError(msg)
-
-
-def _prepare_finish_callback(
-    f: Callable[[DataFrame], None] | None,
-) -> Callable[[PyDataFrame], None] | None:
-    if f is None:
-        return None
-
-    def cb(pydf: PyDataFrame) -> None:
-        nonlocal f
-        f(DataFrame._from_pydf(pydf))
-
-    return cb
-
-
-class PartitionMaxSize(PartitioningScheme):
-    """
-    Partitioning scheme to write files with a maximum size.
-
-    This partitioning scheme generates files that have a given maximum size. If
-    the size reaches the maximum size, it is closed and a new file is opened.
-
-    .. warning::
-        This functionality is currently considered **unstable**. It may be
-        changed at any point without it being considered a breaking change.
-
-    Parameters
-    ----------
-    base_path
-        The base path for the output files.
-    file_path
-        A callback to register or modify the output path for each partition
-        relative to the `base_path`. The callback provides a
-        :class:`polars.io.partition.BasePartitionContext` that contains information
-        about the partition.
-
-        If no callback is given, it defaults to `{ctx.file_idx}.{EXT}`.
-    max_size : int
-        The maximum size in rows of each of the generated files.
-    per_partition_sort_by
-        Columns or expressions to sort over within each partition.
-
-        Note that this might increase the memory consumption needed for each partition.
-    finish_callback
-        A callback that gets called when the query finishes successfully.
-
-        For parquet files, the callback is given a dataframe with metrics about all
-        written files.
-
-    Examples
-    --------
-    Split a parquet file into smaller CSV files with 100 000 rows each:
-
-    >>> pl.scan_parquet("/path/to/file.parquet").sink_csv(
-    ...     PartitionMaxSize("./out", max_size=100_000),
-    ... )  # doctest: +SKIP
-
-    See Also
-    --------
-    PartitionByKey
-    PartitionParted
-    polars.io.partition.BasePartitionContext
-    """
-
-    def __init__(
-        self,
-        base_path: str | Path,
-        *,
-        file_path: Callable[[BasePartitionContext], Path | str | IO[bytes] | IO[str]]
-        | None = None,
-        max_size: int,
-        per_partition_sort_by: str | Expr | Iterable[str | Expr] | None = None,
-        finish_callback: Callable[[DataFrame], None] | None = None,
-    ) -> None:
-        issue_unstable_warning("partitioning strategies are considered unstable.")
-        super().__init__(
-            PyPartitioning.new_max_size(
-                base_path=base_path,
-                file_path_cb=_cast_base_file_path_cb(file_path),
-                max_size=max_size,
-                per_partition_sort_by=_prepare_per_partition_sort_by(
-                    per_partition_sort_by
-                ),
-                finish_callback=_prepare_finish_callback(finish_callback),
-            )
-        )
-
-
-def _lower_by(
-    by: str | Expr | Sequence[str | Expr] | Mapping[str, Expr],
-) -> list[PyExpr]:
-    def to_expr(i: str | Expr) -> Expr:
-        if isinstance(i, str):
-            return col(i)
-        else:
-            return i
-
-    lowered_by: list[PyExpr]
-    if isinstance(by, str):
-        lowered_by = [col(by)._pyexpr]
-    elif isinstance(by, Expr):
-        lowered_by = [by._pyexpr]
-    elif isinstance(by, Sequence):
-        lowered_by = [to_expr(e)._pyexpr for e in by]
-    elif isinstance(by, Mapping):
-        lowered_by = [e.alias(n)._pyexpr for n, e in by.items()]
-    else:
-        msg = "invalid `by` type"
-        raise TypeError(msg)
-
-    return lowered_by
-
-
-class PartitionByKey(PartitioningScheme):
-    """
-    Partitioning scheme to write files split by the values of keys.
-
-    This partitioning scheme generates an arbitrary amount of files splitting
-    the data depending on what the value is of key expressions.
-
-    The amount of files that can be written is not limited. However, when
-    writing beyond a certain amount of files, the data for the remaining
-    partitions is buffered before writing to the file.
-
-    .. warning::
-        This functionality is currently considered **unstable**. It may be
-        changed at any point without it being considered a breaking change.
-
-    Parameters
-    ----------
-    base_path
-        The base path for the output files.
-
-        Use the `mkdir` option on the `sink_*` methods to ensure directories in
-        the path are created.
-    file_path
-        A callback to register or modify the output path for each partition
-        relative to the `base_path`. The callback provides a
-        :class:`polars.io.partition.KeyedPartitionContext` that contains information
-        about the partition.
-
-        If no callback is given, it defaults to
-        `{ctx.keys.hive_dirs()}/{ctx.in_part_idx}.{EXT}`.
-    by
-        The expressions to partition by.
-    include_key : bool
-        Whether to include the key columns in the output files.
-    per_partition_sort_by
-        Columns or expressions to sort over within each partition.
-
-        Note that this might increase the memory consumption needed for each partition.
-    finish_callback
-        A callback that gets called when the query finishes successfully.
-
-        For parquet files, the callback is given a dataframe with metrics about all
-        written files.
-
-    Examples
-    --------
-    Split into a hive-partitioning style partition:
-
-    >>> (
-    ...     pl.DataFrame({"a": [1, 2, 3], "b": [5, 7, 9], "c": ["A", "B", "C"]})
-    ...     .lazy()
-    ...     .sink_parquet(
-    ...         PartitionByKey(
-    ...             "./out",
-    ...             by=[pl.col.a, pl.col.b],
-    ...             include_key=False,
-    ...         ),
-    ...         mkdir=True,
-    ...     )
-    ... )  # doctest: +SKIP
-
-    Split a parquet file by a column `year` into CSV files:
-
-    >>> pl.scan_parquet("/path/to/file.parquet").sink_csv(
-    ...     PartitionByKey(
-    ...         "./out/",
-    ...         file_path=lambda ctx: f"year={ctx.keys[0].str_value}.csv",
-    ...         by="year",
-    ...     ),
-    ... )  # doctest: +SKIP
-
-    See Also
-    --------
-    PartitionMaxSize
-    PartitionParted
-    polars.io.partition.KeyedPartitionContext
-    """
-
-    def __init__(
-        self,
-        base_path: str | Path,
-        *,
-        file_path: Callable[[KeyedPartitionContext], Path | str | IO[bytes] | IO[str]]
-        | None = None,
-        by: str | Expr | Sequence[str | Expr] | Mapping[str, Expr],
-        include_key: bool = True,
-        per_partition_sort_by: str | Expr | Iterable[str | Expr] | None = None,
-        finish_callback: Callable[[DataFrame], None] | None = None,
-    ) -> None:
-        issue_unstable_warning("partitioning strategies are considered unstable.")
-
-        lowered_by = _lower_by(by)
-        super().__init__(
-            PyPartitioning.new_by_key(
-                base_path=base_path,
-                file_path_cb=_cast_keyed_file_path_cb(file_path),
-                by=lowered_by,
-                include_key=include_key,
-                per_partition_sort_by=_prepare_per_partition_sort_by(
-                    per_partition_sort_by
-                ),
-                finish_callback=_prepare_finish_callback(finish_callback),
-            )
-        )
-
-
-class PartitionParted(PartitioningScheme):
-    """
-    Partitioning scheme to split parted dataframes.
-
-    This is a specialized version of :class:`PartitionByKey`. Whereas
-    :class:`PartitionByKey` accepts data in any order, this scheme expects the input
-    data to be pre-grouped or pre-sorted. This scheme suffers a lot less overhead than
-    :class:`PartitionByKey`, but may not always be applicable.
-
-    Each new value of the key expressions starts a new partition, therefore repeating
-    the same value multiple times may overwrite previous partitions.
-
-    .. warning::
-        This functionality is currently considered **unstable**. It may be
-        changed at any point without it being considered a breaking change.
-
-    Parameters
-    ----------
-    base_path
-        The base path for the output files.
-
-        Use the `mkdir` option on the `sink_*` methods to ensure directories in
-        the path are created.
-    file_path
-        A callback to register or modify the output path for each partition
-        relative to the `base_path`. The callback provides a
-        :class:`polars.io.partition.KeyedPartitionContext` that contains information
-        about the partition.
-
-        If no callback is given, it defaults to
-        `{ctx.keys.hive_dirs()}/{ctx.in_part_idx}.{EXT}`.
-    by
-        The expressions to partition by.
-    include_key : bool
-        Whether to include the key columns in the output files.
-    per_partition_sort_by
-        Columns or expressions to sort over within each partition.
-
-        Note that this might increase the memory consumption needed for each partition.
-    finish_callback
-        A callback that gets called when the query finishes successfully.
-
-        For parquet files, the callback is given a dataframe with metrics about all
-        written files.
-
-    Examples
-    --------
-    Split a parquet file by a column `year` into CSV files:
-
-    >>> pl.scan_parquet("/path/to/file.parquet").sink_csv(
-    ...     PartitionParted("./out", by="year"),
-    ...     mkdir=True,
-    ... )  # doctest: +SKIP
-
-    See Also
-    --------
-    PartitionMaxSize
-    PartitionByKey
-    polars.io.partition.KeyedPartitionContext
-    """
-
-    def __init__(
-        self,
-        base_path: str | Path,
-        *,
-        file_path: Callable[[KeyedPartitionContext], Path | str | IO[bytes] | IO[str]]
-        | None = None,
-        by: str | Expr | Sequence[str | Expr] | Mapping[str, Expr],
-        include_key: bool = True,
-        per_partition_sort_by: str | Expr | Iterable[str | Expr] | None = None,
-        finish_callback: Callable[[DataFrame], None] | None = None,
-    ) -> None:
-        issue_unstable_warning("partitioning strategies are considered unstable.")
-
-        lowered_by = _lower_by(by)
-        super().__init__(
-            PyPartitioning.new_by_key(
-                base_path=base_path,
-                file_path_cb=_cast_keyed_file_path_cb(file_path),
-                by=lowered_by,
-                include_key=include_key,
-                per_partition_sort_by=_prepare_per_partition_sort_by(
-                    per_partition_sort_by
-                ),
-                finish_callback=_prepare_finish_callback(finish_callback),
-            )
-        )
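For orientation, here is a minimal sketch of how the partitioning schemes removed above were used, assembled from the docstrings in the deleted module. The input file, output directories, and the `year` column are illustrative placeholders taken from those docstrings, and the sketch assumes a Polars build that still ships `polars.io.partition`; it is not verified against this wheel.

# Sketch based on the docstrings of the deleted polars/io/partition.py.
# Paths and the `year` column are placeholders; the API is marked unstable upstream.
import polars as pl
from polars.io.partition import PartitionByKey, PartitionMaxSize

lf = pl.scan_parquet("/path/to/file.parquet")  # hypothetical input

# Cap each output file at 100_000 rows; files default to {file_idx}.csv.
lf.sink_csv(PartitionMaxSize("./out", max_size=100_000), mkdir=True)

# Hive-style split on a key column, excluding the key from the written files.
lf.sink_parquet(
    PartitionByKey("./out_by_year", by="year", include_key=False),
    mkdir=True,
)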
polars/io/plugins.py
DELETED
@@ -1,187 +0,0 @@
-from __future__ import annotations
-
-import os
-import sys
-from collections.abc import Iterator
-from typing import TYPE_CHECKING, Callable
-
-import polars._reexport as pl
-from polars._utils.unstable import unstable
-
-if TYPE_CHECKING:
-    from collections.abc import Iterator
-    from typing import Callable
-
-    from polars import DataFrame, Expr, LazyFrame
-    from polars._typing import SchemaDict
-
-
-@unstable()
-def register_io_source(
-    io_source: Callable[
-        [list[str] | None, Expr | None, int | None, int | None], Iterator[DataFrame]
-    ],
-    *,
-    schema: Callable[[], SchemaDict] | SchemaDict,
-    validate_schema: bool = False,
-    is_pure: bool = False,
-) -> LazyFrame:
-    """
-    Register your IO plugin and initialize a LazyFrame.
-
-    See the `user guide <https://docs.pola.rs/user-guide/plugins/io_plugins>`_
-    for more information about plugins.
-
-    .. warning::
-        This functionality is considered **unstable**. It may be changed
-        at any point without it being considered a breaking change.
-
-
-    Parameters
-    ----------
-    io_source
-        Function that accepts the following arguments:
-            with_columns
-                Columns that are projected. The reader must
-                project these columns if applied
-            predicate
-                Polars expression. The reader must filter
-                their rows accordingly.
-            n_rows
-                Materialize only n rows from the source.
-                The reader can stop when `n_rows` are read.
-            batch_size
-                A hint of the ideal batch size the reader's
-                generator must produce.
-
-        The function should return an iterator/generator
-        that produces DataFrames.
-    schema
-        Schema or function that when called produces the schema that the reader
-        will produce before projection pushdown.
-    validate_schema
-        Whether the engine should validate if the batches generated match
-        the given schema. It's an implementation error if this isn't
-        the case and can lead to bugs that are hard to solve.
-    is_pure
-        Whether the IO source is pure. Repeated occurrences of the same IO source in
-        a LazyFrame plan can be de-duplicated during optimization if they are
-        pure.
-
-    Returns
-    -------
-    LazyFrame
-    """
-
-    def wrap(
-        with_columns: list[str] | None,
-        predicate: bytes | None,
-        n_rows: int | None,
-        batch_size: int | None,
-    ) -> tuple[Iterator[DataFrame], bool]:
-        parsed_predicate_success = True
-        parsed_predicate = None
-        if predicate:
-            try:
-                parsed_predicate = pl.Expr.deserialize(predicate)
-            except Exception as e:
-                if os.environ.get("POLARS_VERBOSE"):
-                    print(
-                        f"failed parsing IO plugin expression\n\nfilter will be handled on Polars' side: {e}",
-                        file=sys.stderr,
-                    )
-                parsed_predicate_success = False
-
-        return io_source(
-            with_columns, parsed_predicate, n_rows, batch_size
-        ), parsed_predicate_success
-
-    return pl.LazyFrame._scan_python_function(
-        schema=schema,
-        scan_fn=wrap,
-        pyarrow=False,
-        validate_schema=validate_schema,
-        is_pure=is_pure,
-    )
-
-
-@unstable()
-def _defer(
-    function: Callable[[], DataFrame],
-    *,
-    schema: SchemaDict | Callable[[], SchemaDict],
-    validate_schema: bool = True,
-) -> LazyFrame:
-    """
-    Deferred execution.
-
-    Takes a function that produces a `DataFrame` but defers execution until the
-    `LazyFrame` is collected.
-
-    Parameters
-    ----------
-    function
-        Function that takes no arguments and produces a `DataFrame`.
-    schema
-        Schema of the `DataFrame` the deferred function will return.
-        The caller must ensure this schema is correct.
-    validate_schema
-        Whether the engine should validate if the batches generated match
-        the given schema. It's an implementation error if this isn't
-        the case and can lead to bugs that are hard to solve.
-
-    Examples
-    --------
-    Delay DataFrame execution until query is executed.
-
-    >>> import numpy as np
-    >>> np.random.seed(0)
-    >>> lf = pl.defer(
-    ...     lambda: pl.DataFrame({"a": np.random.randn(3)}), schema={"a": pl.Float64}
-    ... )
-    >>> lf.collect()
-    shape: (3, 1)
-    ┌──────────┐
-    │ a        │
-    │ ---      │
-    │ f64      │
-    ╞══════════╡
-    │ 1.764052 │
-    │ 0.400157 │
-    │ 0.978738 │
-    └──────────┘
-
-    Run an eager source in Polars Cloud
-
-    >>> (
-    ...     pl.defer(
-    ...         lambda: pl.read_database("select * from tbl"),
-    ...         schema={"a": pl.Float64, "b": pl.Boolean},
-    ...     )
-    ...     .filter("b")
-    ...     .sum("a")
-    ...     .remote()
-    ...     .collect()
-    ... )  # doctest: +SKIP
-
-
-    """
-
-    def source(
-        with_columns: list[str] | None,
-        predicate: Expr | None,
-        n_rows: int | None,
-        batch_size: int | None,
-    ) -> Iterator[DataFrame]:
-        lf = function().lazy()
-        if with_columns is not None:
-            lf = lf.select(with_columns)
-        if predicate is not None:
-            lf = lf.filter(predicate)
-        if n_rows is not None:
-            lf = lf.limit(n_rows)
-        yield lf.collect()
-
-    return register_io_source(
-        io_source=source, schema=schema, validate_schema=validate_schema
-    )
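For reference, a toy IO-plugin registration in the style of the deleted `register_io_source` docstring; the source function, schema, and data below are illustrative placeholders, not part of the original module, and the import assumes a build that still ships `polars.io.plugins`.

# Toy sketch of an IO plugin built on the removed polars/io/plugins.py API.
# The source receives the pushed-down projection, predicate, row limit, and
# batch-size hint, and must yield DataFrames matching the declared schema.
from collections.abc import Iterator

import polars as pl
from polars.io.plugins import register_io_source


def my_source(
    with_columns: list[str] | None,
    predicate: pl.Expr | None,
    n_rows: int | None,
    batch_size: int | None,
) -> Iterator[pl.DataFrame]:
    df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})  # placeholder data
    if with_columns is not None:
        df = df.select(with_columns)
    if predicate is not None:
        df = df.filter(predicate)
    if n_rows is not None:
        df = df.head(n_rows)
    yield df


lf = register_io_source(my_source, schema={"a": pl.Int64, "b": pl.String})
print(lf.collect())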