cudf-polars-cu12 24.8.0a281__py3-none-any.whl → 25.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/__init__.py +9 -3
- cudf_polars/callback.py +258 -23
- cudf_polars/containers/__init__.py +2 -2
- cudf_polars/containers/column.py +167 -66
- cudf_polars/containers/dataframe.py +157 -58
- cudf_polars/dsl/expr.py +37 -1397
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +246 -0
- cudf_polars/dsl/expressions/base.py +300 -0
- cudf_polars/dsl/expressions/binaryop.py +135 -0
- cudf_polars/dsl/expressions/boolean.py +312 -0
- cudf_polars/dsl/expressions/datetime.py +196 -0
- cudf_polars/dsl/expressions/literal.py +91 -0
- cudf_polars/dsl/expressions/rolling.py +40 -0
- cudf_polars/dsl/expressions/selection.py +92 -0
- cudf_polars/dsl/expressions/sorting.py +97 -0
- cudf_polars/dsl/expressions/string.py +362 -0
- cudf_polars/dsl/expressions/ternary.py +53 -0
- cudf_polars/dsl/expressions/unary.py +339 -0
- cudf_polars/dsl/ir.py +1202 -427
- cudf_polars/dsl/nodebase.py +150 -0
- cudf_polars/dsl/to_ast.py +318 -0
- cudf_polars/dsl/translate.py +398 -181
- cudf_polars/dsl/traversal.py +175 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +43 -0
- cudf_polars/experimental/dask_serialize.py +59 -0
- cudf_polars/experimental/dispatch.py +84 -0
- cudf_polars/experimental/io.py +325 -0
- cudf_polars/experimental/parallel.py +253 -0
- cudf_polars/experimental/select.py +36 -0
- cudf_polars/testing/asserts.py +139 -19
- cudf_polars/testing/plugin.py +242 -0
- cudf_polars/typing/__init__.py +51 -10
- cudf_polars/utils/dtypes.py +88 -39
- cudf_polars/utils/sorting.py +2 -2
- cudf_polars/utils/versions.py +22 -0
- {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/METADATA +15 -12
- cudf_polars_cu12-25.2.0.dist-info/RECORD +48 -0
- {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/WHEEL +1 -1
- cudf_polars_cu12-24.8.0a281.dist-info/RECORD +0 -23
- {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/LICENSE +0 -0
- {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
25.02.00
|
cudf_polars/__init__.py
CHANGED
|
@@ -12,11 +12,17 @@ from __future__ import annotations
|
|
|
12
12
|
|
|
13
13
|
from cudf_polars._version import __git_commit__, __version__
|
|
14
14
|
from cudf_polars.callback import execute_with_cudf
|
|
15
|
-
from cudf_polars.dsl.translate import
|
|
15
|
+
from cudf_polars.dsl.translate import Translator
|
|
16
|
+
|
|
17
|
+
# Check we have a supported polars version
|
|
18
|
+
from cudf_polars.utils.versions import _ensure_polars_version
|
|
19
|
+
|
|
20
|
+
_ensure_polars_version()
|
|
21
|
+
del _ensure_polars_version
|
|
16
22
|
|
|
17
23
|
__all__: list[str] = [
|
|
18
|
-
"
|
|
19
|
-
"translate_ir",
|
|
24
|
+
"Translator",
|
|
20
25
|
"__git_commit__",
|
|
21
26
|
"__version__",
|
|
27
|
+
"execute_with_cudf",
|
|
22
28
|
]
|
cudf_polars/callback.py
CHANGED
|
@@ -5,15 +5,27 @@
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
import contextlib
|
|
9
|
+
import os
|
|
10
|
+
import warnings
|
|
11
|
+
from functools import cache, partial
|
|
12
|
+
from typing import TYPE_CHECKING, Literal
|
|
10
13
|
|
|
11
14
|
import nvtx
|
|
12
15
|
|
|
13
|
-
from
|
|
16
|
+
from polars.exceptions import ComputeError, PerformanceWarning
|
|
17
|
+
|
|
18
|
+
import pylibcudf
|
|
19
|
+
import rmm
|
|
20
|
+
from rmm._cuda import gpu
|
|
21
|
+
|
|
22
|
+
from cudf_polars.dsl.translate import Translator
|
|
14
23
|
|
|
15
24
|
if TYPE_CHECKING:
|
|
25
|
+
from collections.abc import Generator
|
|
26
|
+
|
|
16
27
|
import polars as pl
|
|
28
|
+
from polars import GPUEngine
|
|
17
29
|
|
|
18
30
|
from cudf_polars.dsl.ir import IR
|
|
19
31
|
from cudf_polars.typing import NodeTraverser
|
|
@@ -21,25 +33,214 @@ if TYPE_CHECKING:
|
|
|
21
33
|
__all__: list[str] = ["execute_with_cudf"]
|
|
22
34
|
|
|
23
35
|
|
|
36
|
+
_SUPPORTED_PREFETCHES = {
|
|
37
|
+
"column_view::get_data",
|
|
38
|
+
"mutable_column_view::get_data",
|
|
39
|
+
"gather",
|
|
40
|
+
"hash_join",
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _env_get_int(name, default):
|
|
45
|
+
try:
|
|
46
|
+
return int(os.getenv(name, default))
|
|
47
|
+
except (ValueError, TypeError): # pragma: no cover
|
|
48
|
+
return default # pragma: no cover
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@cache
|
|
52
|
+
def default_memory_resource(
|
|
53
|
+
device: int,
|
|
54
|
+
cuda_managed_memory: bool, # noqa: FBT001
|
|
55
|
+
) -> rmm.mr.DeviceMemoryResource:
|
|
56
|
+
"""
|
|
57
|
+
Return the default memory resource for cudf-polars.
|
|
58
|
+
|
|
59
|
+
Parameters
|
|
60
|
+
----------
|
|
61
|
+
device
|
|
62
|
+
Disambiguating device id when selecting the device. Must be
|
|
63
|
+
the active device when this function is called.
|
|
64
|
+
cuda_managed_memory
|
|
65
|
+
Whether to use managed memory or not.
|
|
66
|
+
|
|
67
|
+
Returns
|
|
68
|
+
-------
|
|
69
|
+
rmm.mr.DeviceMemoryResource
|
|
70
|
+
The default memory resource that cudf-polars uses. Currently
|
|
71
|
+
a managed memory resource, if `cuda_managed_memory` is `True`.
|
|
72
|
+
else, an async pool resource is returned.
|
|
73
|
+
"""
|
|
74
|
+
try:
|
|
75
|
+
if (
|
|
76
|
+
cuda_managed_memory
|
|
77
|
+
and pylibcudf.utils._is_concurrent_managed_access_supported()
|
|
78
|
+
):
|
|
79
|
+
# Allocating 80% of the available memory for the pool.
|
|
80
|
+
# Leaving a 20% headroom to avoid OOM errors.
|
|
81
|
+
free_memory, _ = rmm.mr.available_device_memory()
|
|
82
|
+
free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
|
|
83
|
+
for key in _SUPPORTED_PREFETCHES:
|
|
84
|
+
pylibcudf.experimental.enable_prefetching(key)
|
|
85
|
+
mr = rmm.mr.PrefetchResourceAdaptor(
|
|
86
|
+
rmm.mr.PoolMemoryResource(
|
|
87
|
+
rmm.mr.ManagedMemoryResource(),
|
|
88
|
+
initial_pool_size=free_memory,
|
|
89
|
+
)
|
|
90
|
+
)
|
|
91
|
+
else:
|
|
92
|
+
mr = rmm.mr.CudaAsyncMemoryResource()
|
|
93
|
+
except RuntimeError as e: # pragma: no cover
|
|
94
|
+
msg, *_ = e.args
|
|
95
|
+
if (
|
|
96
|
+
msg.startswith("RMM failure")
|
|
97
|
+
and msg.find("not supported with this CUDA driver/runtime version") > -1
|
|
98
|
+
):
|
|
99
|
+
raise ComputeError(
|
|
100
|
+
"GPU engine requested, but incorrect cudf-polars package installed. "
|
|
101
|
+
"If your system has a CUDA 11 driver, please uninstall `cudf-polars-cu12` "
|
|
102
|
+
"and install `cudf-polars-cu11`"
|
|
103
|
+
) from None
|
|
104
|
+
else:
|
|
105
|
+
raise
|
|
106
|
+
else:
|
|
107
|
+
return mr
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@contextlib.contextmanager
|
|
111
|
+
def set_memory_resource(
|
|
112
|
+
mr: rmm.mr.DeviceMemoryResource | None,
|
|
113
|
+
) -> Generator[rmm.mr.DeviceMemoryResource, None, None]:
|
|
114
|
+
"""
|
|
115
|
+
Set the current memory resource for an execution block.
|
|
116
|
+
|
|
117
|
+
Parameters
|
|
118
|
+
----------
|
|
119
|
+
mr
|
|
120
|
+
Memory resource to use. If `None`, calls :func:`default_memory_resource`
|
|
121
|
+
to obtain an mr on the currently active device.
|
|
122
|
+
|
|
123
|
+
Returns
|
|
124
|
+
-------
|
|
125
|
+
Memory resource used.
|
|
126
|
+
|
|
127
|
+
Notes
|
|
128
|
+
-----
|
|
129
|
+
At exit, the memory resource is restored to whatever was current
|
|
130
|
+
at entry. If a memory resource is provided, it must be valid to
|
|
131
|
+
use with the currently active device.
|
|
132
|
+
"""
|
|
133
|
+
previous = rmm.mr.get_current_device_resource()
|
|
134
|
+
if mr is None:
|
|
135
|
+
device: int = gpu.getDevice()
|
|
136
|
+
mr = default_memory_resource(
|
|
137
|
+
device=device,
|
|
138
|
+
cuda_managed_memory=bool(
|
|
139
|
+
_env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) != 0
|
|
140
|
+
),
|
|
141
|
+
)
|
|
142
|
+
rmm.mr.set_current_device_resource(mr)
|
|
143
|
+
try:
|
|
144
|
+
yield mr
|
|
145
|
+
finally:
|
|
146
|
+
rmm.mr.set_current_device_resource(previous)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@contextlib.contextmanager
|
|
150
|
+
def set_device(device: int | None) -> Generator[int, None, None]:
|
|
151
|
+
"""
|
|
152
|
+
Set the device the query is executed on.
|
|
153
|
+
|
|
154
|
+
Parameters
|
|
155
|
+
----------
|
|
156
|
+
device
|
|
157
|
+
Device to use. If `None`, uses the current device.
|
|
158
|
+
|
|
159
|
+
Returns
|
|
160
|
+
-------
|
|
161
|
+
Device active for the execution of the block.
|
|
162
|
+
|
|
163
|
+
Notes
|
|
164
|
+
-----
|
|
165
|
+
At exit, the device is restored to whatever was current at entry.
|
|
166
|
+
"""
|
|
167
|
+
previous: int = gpu.getDevice()
|
|
168
|
+
if device is not None:
|
|
169
|
+
gpu.setDevice(device)
|
|
170
|
+
try:
|
|
171
|
+
yield previous
|
|
172
|
+
finally:
|
|
173
|
+
gpu.setDevice(previous)
|
|
174
|
+
|
|
175
|
+
|
|
24
176
|
def _callback(
|
|
25
177
|
ir: IR,
|
|
26
178
|
with_columns: list[str] | None,
|
|
27
179
|
pyarrow_predicate: str | None,
|
|
28
180
|
n_rows: int | None,
|
|
181
|
+
*,
|
|
182
|
+
device: int | None,
|
|
183
|
+
memory_resource: int | None,
|
|
184
|
+
executor: Literal["pylibcudf", "dask-experimental"] | None,
|
|
29
185
|
) -> pl.DataFrame:
|
|
30
186
|
assert with_columns is None
|
|
31
187
|
assert pyarrow_predicate is None
|
|
32
188
|
assert n_rows is None
|
|
33
|
-
with
|
|
34
|
-
|
|
189
|
+
with (
|
|
190
|
+
nvtx.annotate(message="ExecuteIR", domain="cudf_polars"),
|
|
191
|
+
# Device must be set before memory resource is obtained.
|
|
192
|
+
set_device(device),
|
|
193
|
+
set_memory_resource(memory_resource),
|
|
194
|
+
):
|
|
195
|
+
if executor is None or executor == "pylibcudf":
|
|
196
|
+
return ir.evaluate(cache={}).to_polars()
|
|
197
|
+
elif executor == "dask-experimental":
|
|
198
|
+
from cudf_polars.experimental.parallel import evaluate_dask
|
|
35
199
|
|
|
200
|
+
return evaluate_dask(ir).to_polars()
|
|
201
|
+
else:
|
|
202
|
+
raise ValueError(f"Unknown executor '{executor}'")
|
|
36
203
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
204
|
+
|
|
205
|
+
def validate_config_options(config: dict) -> None:
|
|
206
|
+
"""
|
|
207
|
+
Validate the configuration options for the GPU engine.
|
|
208
|
+
|
|
209
|
+
Parameters
|
|
210
|
+
----------
|
|
211
|
+
config
|
|
212
|
+
Configuration options to validate.
|
|
213
|
+
|
|
214
|
+
Raises
|
|
215
|
+
------
|
|
216
|
+
ValueError
|
|
217
|
+
If the configuration contains unsupported options.
|
|
218
|
+
"""
|
|
219
|
+
if unsupported := (
|
|
220
|
+
config.keys()
|
|
221
|
+
- {"raise_on_fail", "parquet_options", "executor", "executor_options"}
|
|
222
|
+
):
|
|
223
|
+
raise ValueError(
|
|
224
|
+
f"Engine configuration contains unsupported settings: {unsupported}"
|
|
225
|
+
)
|
|
226
|
+
assert {"chunked", "chunk_read_limit", "pass_read_limit"}.issuperset(
|
|
227
|
+
config.get("parquet_options", {})
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
# Validate executor_options
|
|
231
|
+
executor = config.get("executor", "pylibcudf")
|
|
232
|
+
if executor == "dask-experimental":
|
|
233
|
+
unsupported = config.get("executor_options", {}).keys() - {
|
|
234
|
+
"max_rows_per_partition",
|
|
235
|
+
"parquet_blocksize",
|
|
236
|
+
}
|
|
237
|
+
else:
|
|
238
|
+
unsupported = config.get("executor_options", {}).keys()
|
|
239
|
+
if unsupported:
|
|
240
|
+
raise ValueError(f"Unsupported executor_options for {executor}: {unsupported}")
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
|
|
43
244
|
"""
|
|
44
245
|
A post optimization callback that attempts to execute the plan with cudf.
|
|
45
246
|
|
|
@@ -48,19 +249,53 @@ def execute_with_cudf(
|
|
|
48
249
|
nt
|
|
49
250
|
NodeTraverser
|
|
50
251
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
without setting a callback.
|
|
252
|
+
config
|
|
253
|
+
GPUEngine configuration object
|
|
54
254
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
255
|
+
Raises
|
|
256
|
+
------
|
|
257
|
+
ValueError
|
|
258
|
+
If the config contains unsupported keys.
|
|
259
|
+
NotImplementedError
|
|
260
|
+
If translation of the plan is unsupported.
|
|
58
261
|
|
|
262
|
+
Notes
|
|
263
|
+
-----
|
|
59
264
|
The NodeTraverser is mutated if the libcudf executor can handle the plan.
|
|
60
265
|
"""
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
266
|
+
device = config.device
|
|
267
|
+
memory_resource = config.memory_resource
|
|
268
|
+
raise_on_fail = config.config.get("raise_on_fail", False)
|
|
269
|
+
executor = config.config.get("executor", None)
|
|
270
|
+
validate_config_options(config.config)
|
|
271
|
+
|
|
272
|
+
with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
|
|
273
|
+
translator = Translator(nt, config)
|
|
274
|
+
ir = translator.translate_ir()
|
|
275
|
+
ir_translation_errors = translator.errors
|
|
276
|
+
if len(ir_translation_errors):
|
|
277
|
+
# TODO: Display these errors in user-friendly way.
|
|
278
|
+
# tracked in https://github.com/rapidsai/cudf/issues/17051
|
|
279
|
+
unique_errors = sorted(set(ir_translation_errors), key=str)
|
|
280
|
+
formatted_errors = "\n".join(
|
|
281
|
+
f"- {e.__class__.__name__}: {e}" for e in unique_errors
|
|
282
|
+
)
|
|
283
|
+
error_message = (
|
|
284
|
+
"Query execution with GPU not possible: unsupported operations."
|
|
285
|
+
f"\nThe errors were:\n{formatted_errors}"
|
|
286
|
+
)
|
|
287
|
+
exception = NotImplementedError(error_message, unique_errors)
|
|
288
|
+
if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
|
|
289
|
+
warnings.warn(error_message, PerformanceWarning, stacklevel=2)
|
|
290
|
+
if raise_on_fail:
|
|
291
|
+
raise exception
|
|
292
|
+
else:
|
|
293
|
+
nt.set_udf(
|
|
294
|
+
partial(
|
|
295
|
+
_callback,
|
|
296
|
+
ir,
|
|
297
|
+
device=device,
|
|
298
|
+
memory_resource=memory_resource,
|
|
299
|
+
executor=executor,
|
|
300
|
+
)
|
|
301
|
+
)
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
-
__all__: list[str] = ["
|
|
8
|
+
__all__: list[str] = ["Column", "DataFrame"]
|
|
9
9
|
|
|
10
|
-
from cudf_polars.containers.column import Column
|
|
10
|
+
from cudf_polars.containers.column import Column
|
|
11
11
|
from cudf_polars.containers.dataframe import DataFrame
|
cudf_polars/containers/column.py
CHANGED
|
@@ -8,12 +8,25 @@ from __future__ import annotations
|
|
|
8
8
|
import functools
|
|
9
9
|
from typing import TYPE_CHECKING
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
from polars.exceptions import InvalidOperationError
|
|
12
|
+
|
|
13
|
+
import pylibcudf as plc
|
|
14
|
+
from pylibcudf.strings.convert.convert_floats import from_floats, is_float, to_floats
|
|
15
|
+
from pylibcudf.strings.convert.convert_integers import (
|
|
16
|
+
from_integers,
|
|
17
|
+
is_integer,
|
|
18
|
+
to_integers,
|
|
19
|
+
)
|
|
20
|
+
from pylibcudf.traits import is_floating_point
|
|
21
|
+
|
|
22
|
+
from cudf_polars.utils.dtypes import is_order_preserving_cast
|
|
12
23
|
|
|
13
24
|
if TYPE_CHECKING:
|
|
14
25
|
from typing_extensions import Self
|
|
15
26
|
|
|
16
|
-
|
|
27
|
+
import polars as pl
|
|
28
|
+
|
|
29
|
+
__all__: list[str] = ["Column"]
|
|
17
30
|
|
|
18
31
|
|
|
19
32
|
class Column:
|
|
@@ -24,6 +37,9 @@ class Column:
|
|
|
24
37
|
order: plc.types.Order
|
|
25
38
|
null_order: plc.types.NullOrder
|
|
26
39
|
is_scalar: bool
|
|
40
|
+
# Optional name, only ever set by evaluation of NamedExpr nodes
|
|
41
|
+
# The internal evaluation should not care about the name.
|
|
42
|
+
name: str | None
|
|
27
43
|
|
|
28
44
|
def __init__(
|
|
29
45
|
self,
|
|
@@ -32,14 +48,12 @@ class Column:
|
|
|
32
48
|
is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
|
|
33
49
|
order: plc.types.Order = plc.types.Order.ASCENDING,
|
|
34
50
|
null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
|
|
51
|
+
name: str | None = None,
|
|
35
52
|
):
|
|
36
53
|
self.obj = column
|
|
37
54
|
self.is_scalar = self.obj.size() == 1
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
self.is_sorted = is_sorted
|
|
41
|
-
self.order = order
|
|
42
|
-
self.null_order = null_order
|
|
55
|
+
self.name = name
|
|
56
|
+
self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)
|
|
43
57
|
|
|
44
58
|
@functools.cached_property
|
|
45
59
|
def obj_scalar(self) -> plc.Scalar:
|
|
@@ -61,9 +75,26 @@ class Column:
|
|
|
61
75
|
)
|
|
62
76
|
return plc.copying.get_element(self.obj, 0)
|
|
63
77
|
|
|
78
|
+
def rename(self, name: str | None, /) -> Self:
|
|
79
|
+
"""
|
|
80
|
+
Return a shallow copy with a new name.
|
|
81
|
+
|
|
82
|
+
Parameters
|
|
83
|
+
----------
|
|
84
|
+
name
|
|
85
|
+
New name
|
|
86
|
+
|
|
87
|
+
Returns
|
|
88
|
+
-------
|
|
89
|
+
Shallow copy of self with new name set.
|
|
90
|
+
"""
|
|
91
|
+
new = self.copy()
|
|
92
|
+
new.name = name
|
|
93
|
+
return new
|
|
94
|
+
|
|
64
95
|
def sorted_like(self, like: Column, /) -> Self:
|
|
65
96
|
"""
|
|
66
|
-
|
|
97
|
+
Return a shallow copy with sortedness from like.
|
|
67
98
|
|
|
68
99
|
Parameters
|
|
69
100
|
----------
|
|
@@ -72,16 +103,122 @@ class Column:
|
|
|
72
103
|
|
|
73
104
|
Returns
|
|
74
105
|
-------
|
|
75
|
-
|
|
106
|
+
Shallow copy of self with metadata set.
|
|
76
107
|
|
|
77
108
|
See Also
|
|
78
109
|
--------
|
|
79
|
-
set_sorted
|
|
110
|
+
set_sorted, copy_metadata
|
|
80
111
|
"""
|
|
81
|
-
return self
|
|
82
|
-
|
|
112
|
+
return type(self)(
|
|
113
|
+
self.obj,
|
|
114
|
+
name=self.name,
|
|
115
|
+
is_sorted=like.is_sorted,
|
|
116
|
+
order=like.order,
|
|
117
|
+
null_order=like.null_order,
|
|
83
118
|
)
|
|
84
119
|
|
|
120
|
+
def astype(self, dtype: plc.DataType) -> Column:
|
|
121
|
+
"""
|
|
122
|
+
Cast the column to as the requested dtype.
|
|
123
|
+
|
|
124
|
+
Parameters
|
|
125
|
+
----------
|
|
126
|
+
dtype
|
|
127
|
+
Datatype to cast to.
|
|
128
|
+
|
|
129
|
+
Returns
|
|
130
|
+
-------
|
|
131
|
+
Column of requested type.
|
|
132
|
+
|
|
133
|
+
Raises
|
|
134
|
+
------
|
|
135
|
+
RuntimeError
|
|
136
|
+
If the cast is unsupported.
|
|
137
|
+
|
|
138
|
+
Notes
|
|
139
|
+
-----
|
|
140
|
+
This only produces a copy if the requested dtype doesn't match
|
|
141
|
+
the current one.
|
|
142
|
+
"""
|
|
143
|
+
if self.obj.type() == dtype:
|
|
144
|
+
return self
|
|
145
|
+
|
|
146
|
+
if dtype.id() == plc.TypeId.STRING or self.obj.type().id() == plc.TypeId.STRING:
|
|
147
|
+
return Column(self._handle_string_cast(dtype))
|
|
148
|
+
else:
|
|
149
|
+
result = Column(plc.unary.cast(self.obj, dtype))
|
|
150
|
+
if is_order_preserving_cast(self.obj.type(), dtype):
|
|
151
|
+
return result.sorted_like(self)
|
|
152
|
+
return result
|
|
153
|
+
|
|
154
|
+
def _handle_string_cast(self, dtype: plc.DataType) -> plc.Column:
|
|
155
|
+
if dtype.id() == plc.TypeId.STRING:
|
|
156
|
+
if is_floating_point(self.obj.type()):
|
|
157
|
+
return from_floats(self.obj)
|
|
158
|
+
else:
|
|
159
|
+
return from_integers(self.obj)
|
|
160
|
+
else:
|
|
161
|
+
if is_floating_point(dtype):
|
|
162
|
+
floats = is_float(self.obj)
|
|
163
|
+
if not plc.interop.to_arrow(
|
|
164
|
+
plc.reduce.reduce(
|
|
165
|
+
floats,
|
|
166
|
+
plc.aggregation.all(),
|
|
167
|
+
plc.DataType(plc.TypeId.BOOL8),
|
|
168
|
+
)
|
|
169
|
+
).as_py():
|
|
170
|
+
raise InvalidOperationError("Conversion from `str` failed.")
|
|
171
|
+
return to_floats(self.obj, dtype)
|
|
172
|
+
else:
|
|
173
|
+
integers = is_integer(self.obj)
|
|
174
|
+
if not plc.interop.to_arrow(
|
|
175
|
+
plc.reduce.reduce(
|
|
176
|
+
integers,
|
|
177
|
+
plc.aggregation.all(),
|
|
178
|
+
plc.DataType(plc.TypeId.BOOL8),
|
|
179
|
+
)
|
|
180
|
+
).as_py():
|
|
181
|
+
raise InvalidOperationError("Conversion from `str` failed.")
|
|
182
|
+
return to_integers(self.obj, dtype)
|
|
183
|
+
|
|
184
|
+
def copy_metadata(self, from_: pl.Series, /) -> Self:
|
|
185
|
+
"""
|
|
186
|
+
Copy metadata from a host series onto self.
|
|
187
|
+
|
|
188
|
+
Parameters
|
|
189
|
+
----------
|
|
190
|
+
from_
|
|
191
|
+
Polars series to copy metadata from
|
|
192
|
+
|
|
193
|
+
Returns
|
|
194
|
+
-------
|
|
195
|
+
Self with metadata set.
|
|
196
|
+
|
|
197
|
+
See Also
|
|
198
|
+
--------
|
|
199
|
+
set_sorted, sorted_like
|
|
200
|
+
"""
|
|
201
|
+
self.name = from_.name
|
|
202
|
+
if len(from_) <= 1:
|
|
203
|
+
return self
|
|
204
|
+
ascending = from_.flags["SORTED_ASC"]
|
|
205
|
+
descending = from_.flags["SORTED_DESC"]
|
|
206
|
+
if ascending or descending:
|
|
207
|
+
has_null_first = from_.item(0) is None
|
|
208
|
+
has_null_last = from_.item(-1) is None
|
|
209
|
+
order = (
|
|
210
|
+
plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING
|
|
211
|
+
)
|
|
212
|
+
null_order = plc.types.NullOrder.BEFORE
|
|
213
|
+
if (descending and has_null_first) or (ascending and has_null_last):
|
|
214
|
+
null_order = plc.types.NullOrder.AFTER
|
|
215
|
+
return self.set_sorted(
|
|
216
|
+
is_sorted=plc.types.Sorted.YES,
|
|
217
|
+
order=order,
|
|
218
|
+
null_order=null_order,
|
|
219
|
+
)
|
|
220
|
+
return self
|
|
221
|
+
|
|
85
222
|
def set_sorted(
|
|
86
223
|
self,
|
|
87
224
|
*,
|
|
@@ -125,65 +262,29 @@ class Column:
|
|
|
125
262
|
is_sorted=self.is_sorted,
|
|
126
263
|
order=self.order,
|
|
127
264
|
null_order=self.null_order,
|
|
265
|
+
name=self.name,
|
|
128
266
|
)
|
|
129
267
|
|
|
130
268
|
def mask_nans(self) -> Self:
|
|
131
|
-
"""Return a copy of self with nans masked out."""
|
|
132
|
-
if self.
|
|
133
|
-
|
|
269
|
+
"""Return a shallow copy of self with nans masked out."""
|
|
270
|
+
if plc.traits.is_floating_point(self.obj.type()):
|
|
271
|
+
old_count = self.obj.null_count()
|
|
272
|
+
mask, new_count = plc.transform.nans_to_nulls(self.obj)
|
|
273
|
+
result = type(self)(self.obj.with_mask(mask, new_count))
|
|
274
|
+
if old_count == new_count:
|
|
275
|
+
return result.sorted_like(self)
|
|
276
|
+
return result
|
|
134
277
|
return self.copy()
|
|
135
278
|
|
|
136
279
|
@functools.cached_property
|
|
137
280
|
def nan_count(self) -> int:
|
|
138
281
|
"""Return the number of NaN values in the column."""
|
|
139
|
-
if self.obj.type()
|
|
140
|
-
return
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
).as_py()
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
class NamedColumn(Column):
|
|
152
|
-
"""A column with a name."""
|
|
153
|
-
|
|
154
|
-
name: str
|
|
155
|
-
|
|
156
|
-
def __init__(
|
|
157
|
-
self,
|
|
158
|
-
column: plc.Column,
|
|
159
|
-
name: str,
|
|
160
|
-
*,
|
|
161
|
-
is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
|
|
162
|
-
order: plc.types.Order = plc.types.Order.ASCENDING,
|
|
163
|
-
null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
|
|
164
|
-
) -> None:
|
|
165
|
-
super().__init__(
|
|
166
|
-
column, is_sorted=is_sorted, order=order, null_order=null_order
|
|
167
|
-
)
|
|
168
|
-
self.name = name
|
|
169
|
-
|
|
170
|
-
def copy(self, *, new_name: str | None = None) -> Self:
|
|
171
|
-
"""
|
|
172
|
-
A shallow copy of the column.
|
|
173
|
-
|
|
174
|
-
Parameters
|
|
175
|
-
----------
|
|
176
|
-
new_name
|
|
177
|
-
Optional new name for the copied column.
|
|
178
|
-
|
|
179
|
-
Returns
|
|
180
|
-
-------
|
|
181
|
-
New column sharing data with self.
|
|
182
|
-
"""
|
|
183
|
-
return type(self)(
|
|
184
|
-
self.obj,
|
|
185
|
-
self.name if new_name is None else new_name,
|
|
186
|
-
is_sorted=self.is_sorted,
|
|
187
|
-
order=self.order,
|
|
188
|
-
null_order=self.null_order,
|
|
189
|
-
)
|
|
282
|
+
if plc.traits.is_floating_point(self.obj.type()):
|
|
283
|
+
return plc.interop.to_arrow(
|
|
284
|
+
plc.reduce.reduce(
|
|
285
|
+
plc.unary.is_nan(self.obj),
|
|
286
|
+
plc.aggregation.sum(),
|
|
287
|
+
plc.types.SIZE_TYPE,
|
|
288
|
+
)
|
|
289
|
+
).as_py()
|
|
290
|
+
return 0
|