cudf-polars-cu12 25.4.0__py3-none-any.whl → 25.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +35 -50
- cudf_polars/containers/column.py +38 -0
- cudf_polars/containers/dataframe.py +11 -16
- cudf_polars/dsl/expressions/aggregation.py +25 -61
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +3 -39
- cudf_polars/dsl/expressions/boolean.py +21 -49
- cudf_polars/dsl/expressions/datetime.py +59 -17
- cudf_polars/dsl/expressions/literal.py +24 -24
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +6 -24
- cudf_polars/dsl/expressions/slicing.py +2 -8
- cudf_polars/dsl/expressions/sorting.py +4 -17
- cudf_polars/dsl/expressions/string.py +29 -32
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +32 -73
- cudf_polars/dsl/ir.py +575 -167
- cudf_polars/dsl/nodebase.py +1 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +272 -152
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +0 -8
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +109 -167
- cudf_polars/experimental/io.py +53 -26
- cudf_polars/experimental/join.py +59 -24
- cudf_polars/experimental/parallel.py +155 -133
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +109 -9
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +55 -42
- cudf_polars/typing/__init__.py +27 -5
- cudf_polars/utils/config.py +317 -102
- cudf_polars/utils/dtypes.py +8 -1
- cudf_polars/utils/timer.py +1 -1
- cudf_polars/utils/versions.py +4 -4
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +7 -5
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -73
- cudf_polars_cu12-25.4.0.dist-info/RECORD +0 -55
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
cudf_polars/VERSION
CHANGED
@@ -1 +1 @@
-25.04.00
+25.06.00
cudf_polars/callback.py
CHANGED
@@ -13,6 +13,7 @@ from functools import cache, partial
 from typing import TYPE_CHECKING, Literal, overload
 
 import nvtx
+from typing_extensions import assert_never
 
 from polars.exceptions import ComputeError, PerformanceWarning
 
@@ -22,7 +23,6 @@ from rmm._cuda import gpu
 
 from cudf_polars.dsl.translate import Translator
 from cudf_polars.utils.timer import Timer
-from cudf_polars.utils.versions import POLARS_VERSION_LT_125
 
 if TYPE_CHECKING:
     from collections.abc import Generator
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
 
     from cudf_polars.dsl.ir import IR
     from cudf_polars.typing import NodeTraverser
+    from cudf_polars.utils.config import ConfigOptions
 
 __all__: list[str] = ["execute_with_cudf"]
 
@@ -44,7 +45,7 @@ _SUPPORTED_PREFETCHES = {
 }
 
 
-def _env_get_int(name, default):
+def _env_get_int(name: str, default: int) -> int:
     try:
         return int(os.getenv(name, default))
     except (ValueError, TypeError):  # pragma: no cover
@@ -184,9 +185,8 @@ def _callback(
     n_rows: int | None,
     should_time: Literal[False],
     *,
-    device: int | None,
-    memory_resource: rmm.mr.DeviceMemoryResource | None,
-    executor: Literal["pylibcudf", "dask-experimental"] | None,
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
     timer: Timer | None,
 ) -> pl.DataFrame: ...
 
@@ -199,9 +199,8 @@ def _callback(
     n_rows: int | None,
     should_time: Literal[True],
     *,
-    device: int | None,
-    memory_resource: rmm.mr.DeviceMemoryResource | None,
-    executor: Literal["pylibcudf", "dask-experimental"] | None,
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
     timer: Timer | None,
 ) -> tuple[pl.DataFrame, list[tuple[int, int, str]]]: ...
 
@@ -213,11 +212,10 @@ def _callback(
     n_rows: int | None,
     should_time: bool,  # noqa: FBT001
     *,
-    device: int | None,
-    memory_resource: rmm.mr.DeviceMemoryResource | None,
-    executor: Literal["pylibcudf", "dask-experimental"] | None,
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
     timer: Timer | None,
-):
+) -> pl.DataFrame | tuple[pl.DataFrame, list[tuple[int, int, str]]]:
     assert with_columns is None
     assert pyarrow_predicate is None
     assert n_rows is None
@@ -226,21 +224,20 @@ def _callback(
     with (
         nvtx.annotate(message="ExecuteIR", domain="cudf_polars"),
         # Device must be set before memory resource is obtained.
-        set_device(device),
+        set_device(config_options.device),
         set_memory_resource(memory_resource),
     ):
-        if executor is None or executor == "pylibcudf":
+        if config_options.executor.name == "in-memory":
             df = ir.evaluate(cache={}, timer=timer).to_polars()
             if timer is None:
                 return df
             else:
                 return df, timer.timings
-        elif executor == "dask-experimental":
-            from cudf_polars.experimental.parallel import evaluate_dask
+        elif config_options.executor.name == "streaming":
+            from cudf_polars.experimental.parallel import evaluate_streaming
 
-            return evaluate_dask(ir).to_polars()
-        else:
-            raise ValueError(f"Unknown executor '{executor}'")
+            return evaluate_streaming(ir, config_options).to_polars()
+        assert_never(f"Unknown executor '{config_options.executor}'")
 
 
 def execute_with_cudf(
@@ -259,7 +256,7 @@ def execute_with_cudf(
         profiling should occur).
 
     config
-        GPUEngine configuration object
+        GPUEngine object. Configuration is available as ``engine.config``.
 
     Raises
     ------
@@ -277,16 +274,22 @@ def execute_with_cudf(
     else:
         start = time.monotonic_ns()
         timer = Timer(start - duration_since_start)
-
+
     memory_resource = config.memory_resource
-    device = config.device
-    executor = config.config.get("executor", None)
+
     with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
         translator = Translator(nt, config)
         ir = translator.translate_ir()
         ir_translation_errors = translator.errors
     if timer is not None:
         timer.store(start, time.monotonic_ns(), "gpu-ir-translation")
+
+    if (
+        memory_resource is None
+        and translator.config_options.executor.name == "streaming"
+        and translator.config_options.executor.scheduler == "distributed"
+    ):  # pragma: no cover; Requires distributed cluster
+        memory_resource = rmm.mr.get_current_device_resource()
     if len(ir_translation_errors):
         # TODO: Display these errors in user-friendly way.
         # tracked in https://github.com/rapidsai/cudf/issues/17051
@@ -301,33 +304,15 @@ def execute_with_cudf(
         exception = NotImplementedError(error_message, unique_errors)
         if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
             warnings.warn(error_message, PerformanceWarning, stacklevel=2)
-        if raise_on_fail:
+        if translator.config_options.raise_on_fail:
             raise exception
     else:
-        if POLARS_VERSION_LT_125:  # pragma: no cover
-
-            nt.set_udf(
-                partial(
-                    _callback,
-                    ir,
-                    device=device,
-                    memory_resource=memory_resource,
-                    executor=executor,
-                    timer=None,
-                )
+        nt.set_udf(
+            partial(
+                _callback,
+                ir,
+                memory_resource=memory_resource,
+                config_options=translator.config_options,
+                timer=timer,
             )
-            return
-        nt.set_udf(
-            partial(
-                _callback,
-                ir,
-                device=device,
-                memory_resource=memory_resource,
-                executor=executor,
-                timer=timer,
-            )
-        )
-
-
-if POLARS_VERSION_LT_125:  # pragma: no cover
-    execute_with_cudf = partial(execute_with_cudf, duration_since_start=None)
+        )
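The net effect of the callback.py changes is that executor selection moves from loose string checks ("pylibcudf", "dask-experimental") into a typed ConfigOptions object whose executors are named "in-memory" and "streaming". A sketch of how this is expected to surface from user code; GPUEngine forwards extra keyword arguments into engine.config, from which the translator builds ConfigOptions, but the option names here are illustrative and should be checked against the cudf-polars 25.06 documentation:

import polars as pl

q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum())

# "raise_on_fail" maps to translator.config_options.raise_on_fail;
# "executor" selects evaluate_streaming instead of in-memory evaluation.
engine = pl.GPUEngine(raise_on_fail=True, executor="streaming")
result = q.collect(engine=engine)  # requires a CUDA-capable GPU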
cudf_polars/containers/column.py
CHANGED
@@ -177,6 +177,44 @@ class Column:
             null_order=like.null_order,
         )
 
+    def check_sorted(
+        self,
+        *,
+        order: plc.types.Order,
+        null_order: plc.types.NullOrder,
+    ) -> bool:
+        """
+        Check if the column is sorted.
+
+        Parameters
+        ----------
+        order
+            The requested sort order.
+        null_order
+            Where nulls sort to.
+
+        Returns
+        -------
+        True if the column is sorted, false otherwise.
+
+        Notes
+        -----
+        If the sortedness flag is not set, this launches a kernel to
+        check sortedness.
+        """
+        if self.obj.size() <= 1 or self.obj.size() == self.obj.null_count():
+            return True
+        if self.is_sorted == plc.types.Sorted.YES:
+            return self.order == order and (
+                self.obj.null_count() == 0 or self.null_order == null_order
+            )
+        if plc.sorting.is_sorted(plc.Table([self.obj]), [order], [null_order]):
+            self.sorted = plc.types.Sorted.YES
+            self.order = order
+            self.null_order = null_order
+            return True
+        return False
+
     def astype(self, dtype: plc.DataType) -> Column:
         """
         Cast the column to as the requested dtype.
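For orientation, the new Column.check_sorted short-circuits on the cached sortedness flag and only launches a comparison kernel when the flag is unset. A minimal standalone sketch of that kernel call, assuming a CUDA-capable environment:

import pyarrow as pa
import pylibcudf as plc

# Build a device column and ask whether it is ascending with nulls first.
col = plc.interop.from_arrow(pa.array([1, 2, 2, 5]))
result = plc.sorting.is_sorted(
    plc.Table([col]),              # single-column table
    [plc.types.Order.ASCENDING],   # requested order per column
    [plc.types.NullOrder.BEFORE],  # requested null placement per column
)
# check_sorted caches a True result back onto the Column, so later
# checks with the same order and null placement can skip the kernel.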
cudf_polars/containers/dataframe.py
CHANGED
@@ -8,19 +8,17 @@ from __future__ import annotations
 from functools import cached_property
 from typing import TYPE_CHECKING, cast
 
-import pyarrow as pa
-
 import polars as pl
 
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.utils import conversion, dtypes
+from cudf_polars.utils import conversion
 
 if TYPE_CHECKING:
     from collections.abc import Iterable, Mapping, Sequence, Set
 
-    from typing_extensions import Self
+    from typing_extensions import Any, Self
 
     from cudf_polars.typing import ColumnOptions, DataFrameHeader, Slice
 
@@ -108,17 +106,12 @@ class DataFrame:
         -------
         New dataframe representing the input.
         """
-        table = df.to_arrow()
-        schema = table.schema
-        for i, field in enumerate(schema):
-            schema = schema.set(
-                i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type))
-            )
-        # No-op if the schema is unchanged.
-        d_table = plc.interop.from_arrow(table.cast(schema))
+        plc_table = plc.Table(df)
         return cls(
-            Column(column).copy_metadata(h_col)
-            for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True)
+            Column(d_col, name=name).copy_metadata(h_col)
+            for d_col, h_col, name in zip(
+                plc_table.columns(), df.iter_columns(), df.columns, strict=True
+            )
         )
 
     @classmethod
@@ -246,7 +239,9 @@ class DataFrame:
             for c, other in zip(self.columns, like.columns, strict=True)
         )
 
-    def with_columns(self, columns: Iterable[Column], *, replace_only: bool = False) -> Self:
+    def with_columns(
+        self, columns: Iterable[Column], *, replace_only: bool = False
+    ) -> Self:
         """
         Return a new dataframe with extra columns.
 
@@ -275,7 +270,7 @@ class DataFrame:
         """Drop columns by name."""
         return type(self)(column for column in self.columns if column.name not in names)
 
-    def select(self, names: Sequence[str]) -> Self:
+    def select(self, names: Sequence[str] | Mapping[str, Any]) -> Self:
         """Select columns by name returning DataFrame."""
         try:
            return type(self)(self.column_map[name] for name in names)
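A note on the select signature change above: accepting a Mapping works because iterating a mapping yields its keys, so a schema dict can be passed directly where a list of names was previously required. In plain Python:

names = ["a", "b"]
schema = {"a": "int64", "b": "float64"}
# Both iterate to the same column names, which is all select() needs.
assert list(names) == list(schema)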
cudf_polars/dsl/expressions/aggregation.py
CHANGED
@@ -9,22 +9,13 @@ from __future__ import annotations
 from functools import partial
 from typing import TYPE_CHECKING, Any, ClassVar
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import (
-    AggInfo,
-    ExecutionContext,
-    Expr,
-)
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal
-from cudf_polars.dsl.expressions.unary import UnaryFunction
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Agg"]
@@ -75,11 +66,15 @@ class Agg(Expr):
                 else plc.types.NullPolicy.INCLUDE
             )
         elif name == "quantile":
-            _, quantile = self.children
+            child, quantile = self.children
             if not isinstance(quantile, Literal):
                 raise NotImplementedError("Only support literal quantile values")
+            if options == "equiprobable":
+                raise NotImplementedError("Quantile with equiprobable interpolation")
+            if plc.traits.is_duration(child.dtype):
+                raise NotImplementedError("Quantile with duration data type")
             req = plc.aggregation.quantile(
-                quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options]
+                quantiles=[quantile.value], interp=Agg.interp_mapping[options]
             )
         else:
             raise NotImplementedError(
@@ -91,7 +86,9 @@ class Agg(Expr):
             op = partial(self._reduce, request=req)
         elif name in {"min", "max"}:
             op = partial(op, propagate_nans=options)
-        elif name in {"count", "sum", "first", "last"}:
+        elif name == "count":
+            op = partial(op, include_nulls=options)
+        elif name in {"sum", "first", "last"}:
             pass
         else:
             raise NotImplementedError(
@@ -124,38 +121,19 @@ class Agg(Expr):
         "linear": plc.types.Interpolation.LINEAR,
     }
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if depth >= 1:
-            raise NotImplementedError(
-                "Nested aggregations in groupby"
-            )  # pragma: no cover; check_agg trips first
-        if (isminmax := self.name in {"min", "max"}) and self.options:
-            raise NotImplementedError("Nan propagation in groupby for min/max")
-        (child,) = self.children
-        ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests
-        request = self.request
-        # These are handled specially here because we don't set up the
-        # request for the whole-frame agg because we can avoid a
-        # reduce for these.
+    @property
+    def agg_request(self) -> plc.aggregation.Aggregation:  # noqa: D102
         if self.name == "first":
-            request = plc.aggregation.nth_element(
+            return plc.aggregation.nth_element(
                 0, null_handling=plc.types.NullPolicy.INCLUDE
             )
         elif self.name == "last":
-            request = plc.aggregation.nth_element(
+            return plc.aggregation.nth_element(
                 -1, null_handling=plc.types.NullPolicy.INCLUDE
             )
-        if request is None:
-            raise NotImplementedError(
-                f"Aggregation {self.name} in groupby"
-            )  # pragma: no cover; __init__ trips first
-        if isminmax and plc.traits.is_floating_point(self.dtype):
-            assert expr is not None
-            # Ignore nans in these groupby aggs, do this by masking
-            # nans in the input
-            expr = UnaryFunction(self.dtype, "mask_nans", (), expr)
-        return AggInfo([(expr, request, self)])
+        else:
+            assert self.request is not None, "Init should have raised"
+            return self.request
 
     def _reduce(
         self, column: Column, *, request: plc.aggregation.Aggregation
@@ -167,15 +145,11 @@ class Agg(Expr):
             )
         )
 
-    def _count(self, column: Column) -> Column:
+    def _count(self, column: Column, *, include_nulls: bool) -> Column:
+        null_count = column.null_count if not include_nulls else 0
         return Column(
             plc.Column.from_scalar(
-                plc.interop.from_arrow(
-                    pa.scalar(
-                        column.size - column.null_count,
-                        type=plc.interop.to_arrow(self.dtype),
-                    ),
-                ),
+                plc.Scalar.from_py(column.size - null_count, self.dtype),
                 1,
             )
         )
@@ -184,9 +158,7 @@ class Agg(Expr):
         if column.size == 0 or column.null_count == column.size:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(0, type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(0, self.dtype),
                     1,
                 )
             )
@@ -196,9 +168,7 @@ class Agg(Expr):
         if propagate_nans and column.nan_count > 0:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(float("nan"), self.dtype),
                     1,
                 )
             )
@@ -210,9 +180,7 @@ class Agg(Expr):
         if propagate_nans and column.nan_count > 0:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(float("nan"), self.dtype),
                     1,
                 )
             )
@@ -228,11 +196,7 @@ class Agg(Expr):
             return Column(plc.copying.slice(column.obj, [n - 1, n])[0])
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if context is not ExecutionContext.FRAME:
@@ -243,4 +207,4 @@ class Agg(Expr):
         # Aggregations like quantiles may have additional children that were
         # preprocessed into pylibcudf requests.
         child = self.children[0]
-        return self.op(child.evaluate(df, context=context, mapping=mapping))
+        return self.op(child.evaluate(df, context=context))
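Several hunks above replace a host-side pyarrow scalar round-trip with pylibcudf's direct constructor. A minimal sketch of the two spellings, assuming a CUDA-capable environment:

import pyarrow as pa
import pylibcudf as plc

dtype = plc.DataType(plc.TypeId.INT64)

# 25.4.0 spelling: build a pyarrow scalar on the host, then convert it.
old_scalar = plc.interop.from_arrow(
    pa.scalar(0, type=plc.interop.to_arrow(dtype))
)

# 25.6.0 spelling: construct the device scalar from a Python value.
new_scalar = plc.Scalar.from_py(0, dtype)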
cudf_polars/dsl/expressions/base.py
CHANGED
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -16,7 +16,7 @@ from cudf_polars.containers import Column
 from cudf_polars.dsl.nodebase import Node
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
+    from typing_extensions import Self
 
     from cudf_polars.containers import Column, DataFrame
 
@@ -46,11 +46,7 @@ class Expr(Node["Expr"]):
     """Names of non-child data (not Exprs) for reconstruction."""
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -61,15 +57,10 @@ class Expr(Node["Expr"]):
             DataFrame that will provide columns.
         context
             What context are we performing this evaluation in?
-        mapping
-            Substitution mapping from expressions to Columns, used to
-            override the evaluation of a given expression if we're
-            performing a simple rewritten evaluation.
 
         Notes
         -----
-        Do not call this function directly, but rather
-        :meth:`evaluate` which handles the mapping lookups.
+        Do not call this function directly, but rather :meth:`evaluate`.
 
         Returns
         -------
@@ -87,11 +78,7 @@ class Expr(Node["Expr"]):
        )  # pragma: no cover; translation of unimplemented nodes trips first
 
     def evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -102,10 +89,6 @@ class Expr(Node["Expr"]):
             DataFrame that will provide columns.
         context
             What context are we performing this evaluation in?
-        mapping
-            Substitution mapping from expressions to Columns, used to
-            override the evaluation of a given expression if we're
-            performing a simple rewritten evaluation.
 
         Notes
         -----
@@ -124,37 +107,28 @@ class Expr(Node["Expr"]):
         are returned during translation to the IR, but for now we
         are not perfect.
         """
-        if mapping is None:
-            return self.do_evaluate(df, context=context, mapping=mapping)
-        try:
-            return mapping[self]
-        except KeyError:
-            return self.do_evaluate(df, context=context, mapping=mapping)
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """
-        Collect information about aggregations in groupbys.
+        return self.do_evaluate(df, context=context)
 
-        Parameters
-        ----------
-        depth
-            The depth of aggregating (reduction or sampling)
-            expressions we are currently at.
+    @property
+    def agg_request(self) -> plc.aggregation.Aggregation:
+        """
+        The aggregation for this expression in a grouped aggregation.
 
         Returns
         -------
-        Aggregation info describing the expression to aggregate in the
-        groupby.
+        Aggregation request. Default is to collect the expression.
+
+        Notes
+        -----
+        This presumes that the IR translation has decomposed groupby
+        reductions only into cases we can handle.
 
         Raises
         ------
         NotImplementedError
-            If we can't currently perform the aggregation request, for
-            example nested aggregations like ``a.max().min()``.
+            If requesting an aggregation from an unexpected expression.
         """
-        raise NotImplementedError(
-            f"Collecting aggregation info for {type(self).__name__}"
-        )  # pragma: no cover; check_agg trips first
+        return plc.aggregation.collect_list()
 
 
 class ErrorExpr(Expr):
@@ -166,7 +140,7 @@ class ErrorExpr(Expr):
         self.dtype = dtype
         self.error = error
         self.children = ()
-        self.is_pointwise = True
+        self.is_pointwise = False
 
 
 class NamedExpr:
@@ -202,11 +176,7 @@ class NamedExpr:
         return not self.__eq__(other)
 
     def evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -217,8 +187,6 @@ class NamedExpr:
             DataFrame providing context
         context
             Execution context
-        mapping
-            Substitution mapping
 
         Returns
         -------
@@ -229,13 +197,25 @@ class NamedExpr:
         :meth:`Expr.evaluate` for details, this function just adds the
         name to a column produced from an expression.
         """
-        return self.value.evaluate(df, context=context, mapping=mapping).rename(
-            self.name
-        )
+        return self.value.evaluate(df, context=context).rename(self.name)
+
+    def reconstruct(self, expr: Expr) -> Self:
+        """
+        Rebuild with a new `Expr` value.
+
+        Parameters
+        ----------
+        expr
+            New `Expr` value
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return self.value.collect_agg(depth=depth)
+        Returns
+        -------
+        New `NamedExpr` with `expr` as the underlying expression.
+        The name of the original `NamedExpr` is preserved.
+        """
+        if expr is self.value:
+            return self
+        return type(self)(self.name, expr)
 
 
 class Col(Expr):
@@ -250,21 +230,13 @@ class Col(Expr):
         self.children = ()
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # Deliberately remove the name here so that we guarantee
         # evaluation of the IR produces names.
         return df.column_map[self.name].rename(None)
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo([(self, plc.aggregation.collect_list(), self)])
-
 
 class ColRef(Expr):
     __slots__ = ("index", "table_ref")
@@ -288,11 +260,7 @@ class ColRef(Expr):
         self.children = (column,)
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         raise NotImplementedError(