cudf-polars-cu12 25.4.0__py3-none-any.whl → 25.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +55 -61
- cudf_polars/containers/__init__.py +4 -2
- cudf_polars/containers/column.py +123 -40
- cudf_polars/containers/dataframe.py +70 -35
- cudf_polars/containers/datatype.py +135 -0
- cudf_polars/dsl/expr.py +2 -0
- cudf_polars/dsl/expressions/aggregation.py +51 -71
- cudf_polars/dsl/expressions/base.py +45 -77
- cudf_polars/dsl/expressions/binaryop.py +29 -44
- cudf_polars/dsl/expressions/boolean.py +64 -71
- cudf_polars/dsl/expressions/datetime.py +70 -34
- cudf_polars/dsl/expressions/literal.py +45 -33
- cudf_polars/dsl/expressions/rolling.py +133 -10
- cudf_polars/dsl/expressions/selection.py +13 -31
- cudf_polars/dsl/expressions/slicing.py +6 -13
- cudf_polars/dsl/expressions/sorting.py +9 -21
- cudf_polars/dsl/expressions/string.py +470 -84
- cudf_polars/dsl/expressions/struct.py +138 -0
- cudf_polars/dsl/expressions/ternary.py +9 -13
- cudf_polars/dsl/expressions/unary.py +151 -90
- cudf_polars/dsl/ir.py +798 -331
- cudf_polars/dsl/nodebase.py +11 -4
- cudf_polars/dsl/to_ast.py +61 -20
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +279 -167
- cudf_polars/dsl/traversal.py +64 -15
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +301 -0
- cudf_polars/dsl/utils/groupby.py +93 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +115 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +112 -8
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +216 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsh.py +812 -0
- cudf_polars/experimental/benchmarks/utils.py +725 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +22 -7
- cudf_polars/experimental/distinct.py +194 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +547 -0
- cudf_polars/experimental/groupby.py +174 -196
- cudf_polars/experimental/io.py +626 -51
- cudf_polars/experimental/join.py +104 -33
- cudf_polars/experimental/parallel.py +219 -133
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +132 -7
- cudf_polars/experimental/shuffle.py +126 -18
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +112 -0
- cudf_polars/testing/asserts.py +213 -14
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +77 -67
- cudf_polars/typing/__init__.py +63 -22
- cudf_polars/utils/config.py +584 -117
- cudf_polars/utils/dtypes.py +4 -117
- cudf_polars/utils/timer.py +1 -1
- cudf_polars/utils/versions.py +7 -5
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.8.0.dist-info}/METADATA +13 -18
- cudf_polars_cu12-25.8.0.dist-info/RECORD +81 -0
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.8.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -73
- cudf_polars_cu12-25.4.0.dist-info/RECORD +0 -55
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.8.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.8.0.dist-info}/top_level.txt +0 -0
cudf_polars/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
25.
|
|
1
|
+
25.08.00
|
cudf_polars/callback.py
CHANGED
|
@@ -7,12 +7,14 @@ from __future__ import annotations
|
|
|
7
7
|
|
|
8
8
|
import contextlib
|
|
9
9
|
import os
|
|
10
|
+
import textwrap
|
|
10
11
|
import time
|
|
11
12
|
import warnings
|
|
12
13
|
from functools import cache, partial
|
|
13
14
|
from typing import TYPE_CHECKING, Literal, overload
|
|
14
15
|
|
|
15
16
|
import nvtx
|
|
17
|
+
from typing_extensions import assert_never
|
|
16
18
|
|
|
17
19
|
from polars.exceptions import ComputeError, PerformanceWarning
|
|
18
20
|
|
|
@@ -20,9 +22,10 @@ import pylibcudf
|
|
|
20
22
|
import rmm
|
|
21
23
|
from rmm._cuda import gpu
|
|
22
24
|
|
|
25
|
+
from cudf_polars.dsl.tracing import CUDF_POLARS_NVTX_DOMAIN
|
|
23
26
|
from cudf_polars.dsl.translate import Translator
|
|
27
|
+
from cudf_polars.utils.config import _env_get_int, get_total_device_memory
|
|
24
28
|
from cudf_polars.utils.timer import Timer
|
|
25
|
-
from cudf_polars.utils.versions import POLARS_VERSION_LT_125
|
|
26
29
|
|
|
27
30
|
if TYPE_CHECKING:
|
|
28
31
|
from collections.abc import Generator
|
|
@@ -32,6 +35,7 @@ if TYPE_CHECKING:
|
|
|
32
35
|
|
|
33
36
|
from cudf_polars.dsl.ir import IR
|
|
34
37
|
from cudf_polars.typing import NodeTraverser
|
|
38
|
+
from cudf_polars.utils.config import ConfigOptions
|
|
35
39
|
|
|
36
40
|
__all__: list[str] = ["execute_with_cudf"]
|
|
37
41
|
|
|
@@ -44,13 +48,6 @@ _SUPPORTED_PREFETCHES = {
|
|
|
44
48
|
}
|
|
45
49
|
|
|
46
50
|
|
|
47
|
-
def _env_get_int(name, default):
|
|
48
|
-
try:
|
|
49
|
-
return int(os.getenv(name, default))
|
|
50
|
-
except (ValueError, TypeError): # pragma: no cover
|
|
51
|
-
return default # pragma: no cover
|
|
52
|
-
|
|
53
|
-
|
|
54
51
|
@cache
|
|
55
52
|
def default_memory_resource(
|
|
56
53
|
device: int,
|
|
@@ -101,8 +98,7 @@ def default_memory_resource(
|
|
|
101
98
|
):
|
|
102
99
|
raise ComputeError(
|
|
103
100
|
"GPU engine requested, but incorrect cudf-polars package installed. "
|
|
104
|
-
"
|
|
105
|
-
"and install `cudf-polars-cu11`"
|
|
101
|
+
"cudf-polars requires CUDA 12.0+ to installed."
|
|
106
102
|
) from None
|
|
107
103
|
else:
|
|
108
104
|
raise
|
|
@@ -139,7 +135,11 @@ def set_memory_resource(
|
|
|
139
135
|
mr = default_memory_resource(
|
|
140
136
|
device=device,
|
|
141
137
|
cuda_managed_memory=bool(
|
|
142
|
-
_env_get_int(
|
|
138
|
+
_env_get_int(
|
|
139
|
+
"POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY",
|
|
140
|
+
default=1 if get_total_device_memory() is not None else 0,
|
|
141
|
+
)
|
|
142
|
+
!= 0
|
|
143
143
|
),
|
|
144
144
|
)
|
|
145
145
|
rmm.mr.set_current_device_resource(mr)
|
|
@@ -184,9 +184,8 @@ def _callback(
|
|
|
184
184
|
n_rows: int | None,
|
|
185
185
|
should_time: Literal[False],
|
|
186
186
|
*,
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
executor: Literal["pylibcudf", "dask-experimental"] | None,
|
|
187
|
+
memory_resource: rmm.mr.DeviceMemoryResource | None,
|
|
188
|
+
config_options: ConfigOptions,
|
|
190
189
|
timer: Timer | None,
|
|
191
190
|
) -> pl.DataFrame: ...
|
|
192
191
|
|
|
@@ -199,9 +198,8 @@ def _callback(
|
|
|
199
198
|
n_rows: int | None,
|
|
200
199
|
should_time: Literal[True],
|
|
201
200
|
*,
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
executor: Literal["pylibcudf", "dask-experimental"] | None,
|
|
201
|
+
memory_resource: rmm.mr.DeviceMemoryResource | None,
|
|
202
|
+
config_options: ConfigOptions,
|
|
205
203
|
timer: Timer | None,
|
|
206
204
|
) -> tuple[pl.DataFrame, list[tuple[int, int, str]]]: ...
|
|
207
205
|
|
|
@@ -213,34 +211,42 @@ def _callback(
|
|
|
213
211
|
n_rows: int | None,
|
|
214
212
|
should_time: bool, # noqa: FBT001
|
|
215
213
|
*,
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
executor: Literal["pylibcudf", "dask-experimental"] | None,
|
|
214
|
+
memory_resource: rmm.mr.DeviceMemoryResource | None,
|
|
215
|
+
config_options: ConfigOptions,
|
|
219
216
|
timer: Timer | None,
|
|
220
|
-
):
|
|
217
|
+
) -> pl.DataFrame | tuple[pl.DataFrame, list[tuple[int, int, str]]]:
|
|
221
218
|
assert with_columns is None
|
|
222
219
|
assert pyarrow_predicate is None
|
|
223
220
|
assert n_rows is None
|
|
224
221
|
if timer is not None:
|
|
225
222
|
assert should_time
|
|
226
223
|
with (
|
|
227
|
-
nvtx.annotate(message="ExecuteIR", domain=
|
|
224
|
+
nvtx.annotate(message="ExecuteIR", domain=CUDF_POLARS_NVTX_DOMAIN),
|
|
228
225
|
# Device must be set before memory resource is obtained.
|
|
229
|
-
set_device(device),
|
|
226
|
+
set_device(config_options.device),
|
|
230
227
|
set_memory_resource(memory_resource),
|
|
231
228
|
):
|
|
232
|
-
if executor
|
|
229
|
+
if config_options.executor.name == "in-memory":
|
|
233
230
|
df = ir.evaluate(cache={}, timer=timer).to_polars()
|
|
234
231
|
if timer is None:
|
|
235
232
|
return df
|
|
236
233
|
else:
|
|
237
234
|
return df, timer.timings
|
|
238
|
-
elif executor == "
|
|
239
|
-
from cudf_polars.experimental.parallel import
|
|
235
|
+
elif config_options.executor.name == "streaming":
|
|
236
|
+
from cudf_polars.experimental.parallel import evaluate_streaming
|
|
240
237
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
238
|
+
if timer is not None:
|
|
239
|
+
msg = textwrap.dedent("""\
|
|
240
|
+
LazyFrame.profile() is not supported with the streaming executor.
|
|
241
|
+
To profile execution with the streaming executor, use:
|
|
242
|
+
|
|
243
|
+
- NVIDIA NSight Systems with the 'streaming' scheduler.
|
|
244
|
+
- Dask's built-in profiling tools with the 'distributed' scheduler.
|
|
245
|
+
""")
|
|
246
|
+
raise NotImplementedError(msg)
|
|
247
|
+
|
|
248
|
+
return evaluate_streaming(ir, config_options).to_polars()
|
|
249
|
+
assert_never(f"Unknown executor '{config_options.executor}'")
|
|
244
250
|
|
|
245
251
|
|
|
246
252
|
def execute_with_cudf(
|
|
@@ -259,7 +265,7 @@ def execute_with_cudf(
|
|
|
259
265
|
profiling should occur).
|
|
260
266
|
|
|
261
267
|
config
|
|
262
|
-
GPUEngine
|
|
268
|
+
GPUEngine object. Configuration is available as ``engine.config``.
|
|
263
269
|
|
|
264
270
|
Raises
|
|
265
271
|
------
|
|
@@ -277,16 +283,22 @@ def execute_with_cudf(
|
|
|
277
283
|
else:
|
|
278
284
|
start = time.monotonic_ns()
|
|
279
285
|
timer = Timer(start - duration_since_start)
|
|
280
|
-
|
|
286
|
+
|
|
281
287
|
memory_resource = config.memory_resource
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
|
|
288
|
+
|
|
289
|
+
with nvtx.annotate(message="ConvertIR", domain=CUDF_POLARS_NVTX_DOMAIN):
|
|
285
290
|
translator = Translator(nt, config)
|
|
286
291
|
ir = translator.translate_ir()
|
|
287
292
|
ir_translation_errors = translator.errors
|
|
288
293
|
if timer is not None:
|
|
289
294
|
timer.store(start, time.monotonic_ns(), "gpu-ir-translation")
|
|
295
|
+
|
|
296
|
+
if (
|
|
297
|
+
memory_resource is None
|
|
298
|
+
and translator.config_options.executor.name == "streaming"
|
|
299
|
+
and translator.config_options.executor.scheduler == "distributed"
|
|
300
|
+
): # pragma: no cover; Requires distributed cluster
|
|
301
|
+
memory_resource = rmm.mr.get_current_device_resource()
|
|
290
302
|
if len(ir_translation_errors):
|
|
291
303
|
# TODO: Display these errors in user-friendly way.
|
|
292
304
|
# tracked in https://github.com/rapidsai/cudf/issues/17051
|
|
@@ -301,33 +313,15 @@ def execute_with_cudf(
|
|
|
301
313
|
exception = NotImplementedError(error_message, unique_errors)
|
|
302
314
|
if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
|
|
303
315
|
warnings.warn(error_message, PerformanceWarning, stacklevel=2)
|
|
304
|
-
if raise_on_fail:
|
|
316
|
+
if translator.config_options.raise_on_fail:
|
|
305
317
|
raise exception
|
|
306
318
|
else:
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
memory_resource=memory_resource,
|
|
315
|
-
executor=executor,
|
|
316
|
-
timer=None,
|
|
317
|
-
)
|
|
318
|
-
)
|
|
319
|
-
else:
|
|
320
|
-
nt.set_udf(
|
|
321
|
-
partial(
|
|
322
|
-
_callback,
|
|
323
|
-
ir,
|
|
324
|
-
device=device,
|
|
325
|
-
memory_resource=memory_resource,
|
|
326
|
-
executor=executor,
|
|
327
|
-
timer=timer,
|
|
328
|
-
)
|
|
319
|
+
nt.set_udf(
|
|
320
|
+
partial(
|
|
321
|
+
_callback,
|
|
322
|
+
ir,
|
|
323
|
+
memory_resource=memory_resource,
|
|
324
|
+
config_options=translator.config_options,
|
|
325
|
+
timer=timer,
|
|
329
326
|
)
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
if POLARS_VERSION_LT_125: # pragma: no cover
|
|
333
|
-
execute_with_cudf = partial(execute_with_cudf, duration_since_start=None)
|
|
327
|
+
)
|
|
cudf_polars/containers/__init__.py
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
4
|
"""Containers of concrete data."""
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
-
__all__: list[str] = ["Column", "DataFrame"]
|
|
8
|
+
__all__: list[str] = ["Column", "DataFrame", "DataType"]
|
|
9
9
|
|
|
10
|
+
# dataframe.py & column.py imports DataType, so import in this order to avoid circular import
|
|
11
|
+
from cudf_polars.containers.datatype import DataType # noqa: I001
|
|
10
12
|
from cudf_polars.containers.column import Column
|
|
11
13
|
from cudf_polars.containers.dataframe import DataFrame
|
cudf_polars/containers/column.py
CHANGED
|
@@ -8,6 +8,8 @@ from __future__ import annotations
|
|
|
8
8
|
import functools
|
|
9
9
|
from typing import TYPE_CHECKING
|
|
10
10
|
|
|
11
|
+
import polars as pl
|
|
12
|
+
import polars.datatypes.convert
|
|
11
13
|
from polars.exceptions import InvalidOperationError
|
|
12
14
|
|
|
13
15
|
import pylibcudf as plc
|
|
@@ -19,19 +21,39 @@ from pylibcudf.strings.convert.convert_integers import (
|
|
|
19
21
|
)
|
|
20
22
|
from pylibcudf.traits import is_floating_point
|
|
21
23
|
|
|
24
|
+
from cudf_polars.containers import DataType
|
|
22
25
|
from cudf_polars.utils import conversion
|
|
23
26
|
from cudf_polars.utils.dtypes import is_order_preserving_cast
|
|
24
27
|
|
|
25
28
|
if TYPE_CHECKING:
|
|
26
29
|
from typing_extensions import Self
|
|
27
30
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
+
from cudf_polars.typing import (
|
|
32
|
+
ColumnHeader,
|
|
33
|
+
ColumnOptions,
|
|
34
|
+
DeserializedColumnOptions,
|
|
35
|
+
Slice,
|
|
36
|
+
)
|
|
31
37
|
|
|
32
38
|
__all__: list[str] = ["Column"]
|
|
33
39
|
|
|
34
40
|
|
|
41
|
+
def _dtype_short_repr_to_dtype(dtype_str: str) -> pl.DataType:
    """
    Convert a Polars dtype short repr to a Polars dtype.

    Parameters
    ----------
    dtype_str
        Short string representation of a Polars dtype.

    Returns
    -------
    An instance of the corresponding Polars dtype.

    Raises
    ------
    ValueError
        If Polars cannot parse ``dtype_str``.

    Notes
    -----
    ``list[...]`` representations are unwrapped here, one nesting level
    per recursive call, before the element type is handed to Polars.
    The limitations of ``dtype_short_repr_to_dtype`` are described in
    py-polars/polars/datatypes/convert.py#L299
    """
    list_prefix = "list["
    if dtype_str.startswith(list_prefix):
        element_repr = dtype_str.removeprefix(list_prefix).removesuffix("]")
        return pl.List(_dtype_short_repr_to_dtype(element_repr))

    parsed = polars.datatypes.convert.dtype_short_repr_to_dtype(dtype_str)
    if parsed is None:
        raise ValueError(f"{dtype_str} was not able to be parsed by Polars.")
    # The parser may hand back a dtype class rather than an instance;
    # always return an instance.
    return parsed() if isinstance(parsed, polars.datatypes.DataTypeClass) else parsed
|
|
55
|
+
|
|
56
|
+
|
|
35
57
|
class Column:
|
|
36
58
|
"""An immutable column with sortedness metadata."""
|
|
37
59
|
|
|
@@ -43,10 +65,12 @@ class Column:
|
|
|
43
65
|
# Optional name, only ever set by evaluation of NamedExpr nodes
|
|
44
66
|
# The internal evaluation should not care about the name.
|
|
45
67
|
name: str | None
|
|
68
|
+
dtype: DataType
|
|
46
69
|
|
|
47
70
|
def __init__(
|
|
48
71
|
self,
|
|
49
72
|
column: plc.Column,
|
|
73
|
+
dtype: DataType,
|
|
50
74
|
*,
|
|
51
75
|
is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
|
|
52
76
|
order: plc.types.Order = plc.types.Order.ASCENDING,
|
|
@@ -56,6 +80,7 @@ class Column:
|
|
|
56
80
|
self.obj = column
|
|
57
81
|
self.is_scalar = self.size == 1
|
|
58
82
|
self.name = name
|
|
83
|
+
self.dtype = dtype
|
|
59
84
|
self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)
|
|
60
85
|
|
|
61
86
|
@classmethod
|
|
@@ -81,7 +106,23 @@ class Column:
|
|
|
81
106
|
(plc_column,) = plc.contiguous_split.unpack_from_memoryviews(
|
|
82
107
|
packed_metadata, packed_gpu_data
|
|
83
108
|
).columns()
|
|
84
|
-
return cls(plc_column, **header["column_kwargs"])
|
|
109
|
+
return cls(plc_column, **cls.deserialize_ctor_kwargs(header["column_kwargs"]))
|
|
110
|
+
|
|
111
|
+
@staticmethod
def deserialize_ctor_kwargs(
    column_kwargs: ColumnOptions,
) -> DeserializedColumnOptions:
    """
    Deserialize the constructor kwargs for a Column.

    Parameters
    ----------
    column_kwargs
        Serialized constructor options, in which ``"dtype"`` is a
        Polars dtype short repr string.

    Returns
    -------
    The same options with ``"dtype"`` replaced by a ``DataType``.
    """
    # Parse the dtype first so that a bad dtype string fails before
    # any of the remaining options are read.
    polars_dtype = _dtype_short_repr_to_dtype(  # pragma: no cover
        column_kwargs["dtype"]
    )
    dtype = DataType(polars_dtype)  # pragma: no cover
    options: DeserializedColumnOptions = {
        "is_sorted": column_kwargs["is_sorted"],
        "order": column_kwargs["order"],
        "null_order": column_kwargs["null_order"],
        "name": column_kwargs["name"],
        "dtype": dtype,
    }
    return options
|
|
85
126
|
|
|
86
127
|
def serialize(
|
|
87
128
|
self,
|
|
@@ -105,17 +146,21 @@ class Column:
|
|
|
105
146
|
Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
|
|
106
147
|
"""
|
|
107
148
|
packed = plc.contiguous_split.pack(plc.Table([self.obj]))
|
|
108
|
-
|
|
149
|
+
header: ColumnHeader = {
|
|
150
|
+
"column_kwargs": self.serialize_ctor_kwargs(),
|
|
151
|
+
"frame_count": 2,
|
|
152
|
+
}
|
|
153
|
+
return header, packed.release()
|
|
154
|
+
|
|
155
|
+
def serialize_ctor_kwargs(self) -> ColumnOptions:
    """
    Serialize the constructor kwargs for self.

    Returns
    -------
    The sortedness flags and name of this column, together with the
    string form of its Polars dtype under ``"dtype"``.
    """
    options: ColumnOptions = {
        "is_sorted": self.is_sorted,
        "order": self.order,
        "null_order": self.null_order,
        "name": self.name,
        # The dtype is stored as a string in the serialized options.
        "dtype": pl.polars.dtype_str_repr(self.dtype.polars),
    }
    return options
|
|
114
|
-
header: ColumnHeader = {
|
|
115
|
-
"column_kwargs": column_kwargs,
|
|
116
|
-
"frame_count": 2,
|
|
117
|
-
}
|
|
118
|
-
return header, packed.release()
|
|
119
164
|
|
|
120
165
|
@functools.cached_property
|
|
121
166
|
def obj_scalar(self) -> plc.Scalar:
|
|
@@ -172,12 +217,51 @@ class Column:
|
|
|
172
217
|
return type(self)(
|
|
173
218
|
self.obj,
|
|
174
219
|
name=self.name,
|
|
220
|
+
dtype=self.dtype,
|
|
175
221
|
is_sorted=like.is_sorted,
|
|
176
222
|
order=like.order,
|
|
177
223
|
null_order=like.null_order,
|
|
178
224
|
)
|
|
179
225
|
|
|
180
|
-
def
|
|
226
|
+
def check_sorted(
    self,
    *,
    order: plc.types.Order,
    null_order: plc.types.NullOrder,
) -> bool:
    """
    Check if the column is sorted.

    Parameters
    ----------
    order
        The requested sort order.
    null_order
        Where nulls sort to.

    Returns
    -------
    True if the column is sorted, false otherwise.

    Notes
    -----
    If the sortedness flag is not set, this launches a kernel to
    check sortedness. When that check succeeds, the result is
    recorded on the column (``is_sorted``, ``order`` and
    ``null_order``) so that it is visible to later reads of those
    flags.
    """
    # Columns with at most one row, or made up entirely of nulls, are
    # reported as sorted for any requested ordering.
    if self.size <= 1 or self.size == self.null_count:
        return True
    if self.is_sorted == plc.types.Sorted.YES:
        # The flag is already set: answer from the recorded ordering.
        # Null placement is only compared when the column has nulls.
        return self.order == order and (
            self.null_count == 0 or self.null_order == null_order
        )
    if plc.sorting.is_sorted(plc.Table([self.obj]), [order], [null_order]):
        # Record the result on ``is_sorted``, the attribute tested
        # above. Writing to any other attribute name would leave the
        # flag unset and the recorded result would never be seen.
        self.is_sorted = plc.types.Sorted.YES
        self.order = order
        self.null_order = null_order
        return True
    return False
|
|
263
|
+
|
|
264
|
+
def astype(self, dtype: DataType) -> Column:
|
|
181
265
|
"""
|
|
182
266
|
Cast the column to as the requested dtype.
|
|
183
267
|
|
|
@@ -200,14 +284,18 @@ class Column:
|
|
|
200
284
|
This only produces a copy if the requested dtype doesn't match
|
|
201
285
|
the current one.
|
|
202
286
|
"""
|
|
203
|
-
|
|
287
|
+
plc_dtype = dtype.plc
|
|
288
|
+
if self.obj.type() == plc_dtype:
|
|
204
289
|
return self
|
|
205
290
|
|
|
206
|
-
if
|
|
207
|
-
|
|
291
|
+
if (
|
|
292
|
+
plc_dtype.id() == plc.TypeId.STRING
|
|
293
|
+
or self.obj.type().id() == plc.TypeId.STRING
|
|
294
|
+
):
|
|
295
|
+
return Column(self._handle_string_cast(plc_dtype), dtype=dtype)
|
|
208
296
|
else:
|
|
209
|
-
result = Column(plc.unary.cast(self.obj, dtype)
|
|
210
|
-
if is_order_preserving_cast(self.obj.type(),
|
|
297
|
+
result = Column(plc.unary.cast(self.obj, plc_dtype), dtype=dtype)
|
|
298
|
+
if is_order_preserving_cast(self.obj.type(), plc_dtype):
|
|
211
299
|
return result.sorted_like(self)
|
|
212
300
|
return result
|
|
213
301
|
|
|
@@ -220,24 +308,20 @@ class Column:
|
|
|
220
308
|
else:
|
|
221
309
|
if is_floating_point(dtype):
|
|
222
310
|
floats = is_float(self.obj)
|
|
223
|
-
if not plc.
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
)
|
|
229
|
-
).as_py():
|
|
311
|
+
if not plc.reduce.reduce(
|
|
312
|
+
floats,
|
|
313
|
+
plc.aggregation.all(),
|
|
314
|
+
plc.DataType(plc.TypeId.BOOL8),
|
|
315
|
+
).to_py():
|
|
230
316
|
raise InvalidOperationError("Conversion from `str` failed.")
|
|
231
317
|
return to_floats(self.obj, dtype)
|
|
232
318
|
else:
|
|
233
319
|
integers = is_integer(self.obj)
|
|
234
|
-
if not plc.
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
)
|
|
240
|
-
).as_py():
|
|
320
|
+
if not plc.reduce.reduce(
|
|
321
|
+
integers,
|
|
322
|
+
plc.aggregation.all(),
|
|
323
|
+
plc.DataType(plc.TypeId.BOOL8),
|
|
324
|
+
).to_py():
|
|
241
325
|
raise InvalidOperationError("Conversion from `str` failed.")
|
|
242
326
|
return to_integers(self.obj, dtype)
|
|
243
327
|
|
|
@@ -323,6 +407,7 @@ class Column:
|
|
|
323
407
|
order=self.order,
|
|
324
408
|
null_order=self.null_order,
|
|
325
409
|
name=self.name,
|
|
410
|
+
dtype=self.dtype,
|
|
326
411
|
)
|
|
327
412
|
|
|
328
413
|
def mask_nans(self) -> Self:
|
|
@@ -330,7 +415,7 @@ class Column:
|
|
|
330
415
|
if plc.traits.is_floating_point(self.obj.type()):
|
|
331
416
|
old_count = self.null_count
|
|
332
417
|
mask, new_count = plc.transform.nans_to_nulls(self.obj)
|
|
333
|
-
result = type(self)(self.obj.with_mask(mask, new_count))
|
|
418
|
+
result = type(self)(self.obj.with_mask(mask, new_count), self.dtype)
|
|
334
419
|
if old_count == new_count:
|
|
335
420
|
return result.sorted_like(self)
|
|
336
421
|
return result
|
|
@@ -339,14 +424,12 @@ class Column:
|
|
|
339
424
|
@functools.cached_property
|
|
340
425
|
def nan_count(self) -> int:
|
|
341
426
|
"""Return the number of NaN values in the column."""
|
|
342
|
-
if plc.traits.is_floating_point(self.obj.type()):
|
|
343
|
-
return plc.
|
|
344
|
-
plc.
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
)
|
|
349
|
-
).as_py()
|
|
427
|
+
if self.size > 0 and plc.traits.is_floating_point(self.obj.type()):
|
|
428
|
+
return plc.reduce.reduce(
|
|
429
|
+
plc.unary.is_nan(self.obj),
|
|
430
|
+
plc.aggregation.sum(),
|
|
431
|
+
plc.types.SIZE_TYPE,
|
|
432
|
+
).to_py()
|
|
350
433
|
return 0
|
|
351
434
|
|
|
352
435
|
@property
|
|
@@ -380,4 +463,4 @@ class Column:
|
|
|
380
463
|
conversion.from_polars_slice(zlice, num_rows=self.size),
|
|
381
464
|
)
|
|
382
465
|
(column,) = table.columns()
|
|
383
|
-
return type(self)(column, name=self.name).sorted_like(self)
|
|
466
|
+
return type(self)(column, name=self.name, dtype=self.dtype).sorted_like(self)
|