cudf-polars-cu12 25.4.0__py3-none-any.whl → 25.6.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (61)
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/callback.py +35 -50
  3. cudf_polars/containers/column.py +38 -0
  4. cudf_polars/containers/dataframe.py +11 -16
  5. cudf_polars/dsl/expressions/aggregation.py +25 -61
  6. cudf_polars/dsl/expressions/base.py +40 -72
  7. cudf_polars/dsl/expressions/binaryop.py +3 -39
  8. cudf_polars/dsl/expressions/boolean.py +21 -49
  9. cudf_polars/dsl/expressions/datetime.py +59 -17
  10. cudf_polars/dsl/expressions/literal.py +24 -24
  11. cudf_polars/dsl/expressions/rolling.py +110 -9
  12. cudf_polars/dsl/expressions/selection.py +6 -24
  13. cudf_polars/dsl/expressions/slicing.py +2 -8
  14. cudf_polars/dsl/expressions/sorting.py +4 -17
  15. cudf_polars/dsl/expressions/string.py +29 -32
  16. cudf_polars/dsl/expressions/ternary.py +3 -10
  17. cudf_polars/dsl/expressions/unary.py +32 -73
  18. cudf_polars/dsl/ir.py +575 -167
  19. cudf_polars/dsl/nodebase.py +1 -1
  20. cudf_polars/dsl/to_ast.py +5 -3
  21. cudf_polars/dsl/translate.py +272 -152
  22. cudf_polars/dsl/utils/__init__.py +8 -0
  23. cudf_polars/dsl/utils/aggregations.py +292 -0
  24. cudf_polars/dsl/utils/groupby.py +97 -0
  25. cudf_polars/dsl/utils/naming.py +34 -0
  26. cudf_polars/dsl/utils/replace.py +46 -0
  27. cudf_polars/dsl/utils/rolling.py +113 -0
  28. cudf_polars/dsl/utils/windows.py +186 -0
  29. cudf_polars/experimental/base.py +0 -8
  30. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  31. cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
  32. cudf_polars/experimental/dask_registers.py +196 -0
  33. cudf_polars/experimental/distinct.py +174 -0
  34. cudf_polars/experimental/explain.py +127 -0
  35. cudf_polars/experimental/expressions.py +521 -0
  36. cudf_polars/experimental/groupby.py +109 -167
  37. cudf_polars/experimental/io.py +53 -26
  38. cudf_polars/experimental/join.py +59 -24
  39. cudf_polars/experimental/parallel.py +155 -133
  40. cudf_polars/experimental/repartition.py +69 -0
  41. cudf_polars/experimental/scheduler.py +155 -0
  42. cudf_polars/experimental/select.py +92 -7
  43. cudf_polars/experimental/shuffle.py +109 -9
  44. cudf_polars/experimental/sort.py +45 -0
  45. cudf_polars/experimental/spilling.py +151 -0
  46. cudf_polars/experimental/utils.py +100 -0
  47. cudf_polars/testing/asserts.py +146 -6
  48. cudf_polars/testing/io.py +72 -0
  49. cudf_polars/testing/plugin.py +55 -42
  50. cudf_polars/typing/__init__.py +27 -5
  51. cudf_polars/utils/config.py +317 -102
  52. cudf_polars/utils/dtypes.py +8 -1
  53. cudf_polars/utils/timer.py +1 -1
  54. cudf_polars/utils/versions.py +4 -4
  55. {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +7 -5
  56. cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
  57. {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
  58. cudf_polars/experimental/dask_serialize.py +0 -73
  59. cudf_polars_cu12-25.4.0.dist-info/RECORD +0 -55
  60. {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/licenses/LICENSE +0 -0
  61. {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
cudf_polars/VERSION CHANGED
@@ -1 +1 @@
- 25.04.00
+ 25.06.00
cudf_polars/callback.py CHANGED
@@ -13,6 +13,7 @@ from functools import cache, partial
 from typing import TYPE_CHECKING, Literal, overload
 
 import nvtx
+from typing_extensions import assert_never
 
 from polars.exceptions import ComputeError, PerformanceWarning
 
@@ -22,7 +23,6 @@ from rmm._cuda import gpu
 
 from cudf_polars.dsl.translate import Translator
 from cudf_polars.utils.timer import Timer
-from cudf_polars.utils.versions import POLARS_VERSION_LT_125
 
 if TYPE_CHECKING:
     from collections.abc import Generator
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
 
     from cudf_polars.dsl.ir import IR
     from cudf_polars.typing import NodeTraverser
+    from cudf_polars.utils.config import ConfigOptions
 
 __all__: list[str] = ["execute_with_cudf"]
 
@@ -44,7 +45,7 @@ _SUPPORTED_PREFETCHES = {
 }
 
 
-def _env_get_int(name, default):
+def _env_get_int(name: str, default: int) -> int:
     try:
         return int(os.getenv(name, default))
     except (ValueError, TypeError):  # pragma: no cover
@@ -184,9 +185,8 @@ def _callback(
     n_rows: int | None,
     should_time: Literal[False],
     *,
-    device: int | None,
-    memory_resource: int | None,
-    executor: Literal["pylibcudf", "dask-experimental"] | None,
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
    timer: Timer | None,
 ) -> pl.DataFrame: ...
 
@@ -199,9 +199,8 @@ def _callback(
     n_rows: int | None,
     should_time: Literal[True],
     *,
-    device: int | None,
-    memory_resource: int | None,
-    executor: Literal["pylibcudf", "dask-experimental"] | None,
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
     timer: Timer | None,
 ) -> tuple[pl.DataFrame, list[tuple[int, int, str]]]: ...
 
@@ -213,11 +212,10 @@ def _callback(
     n_rows: int | None,
     should_time: bool,  # noqa: FBT001
     *,
-    device: int | None,
-    memory_resource: int | None,
-    executor: Literal["pylibcudf", "dask-experimental"] | None,
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
     timer: Timer | None,
-):
+) -> pl.DataFrame | tuple[pl.DataFrame, list[tuple[int, int, str]]]:
     assert with_columns is None
     assert pyarrow_predicate is None
     assert n_rows is None
@@ -226,21 +224,20 @@ def _callback(
     with (
         nvtx.annotate(message="ExecuteIR", domain="cudf_polars"),
         # Device must be set before memory resource is obtained.
-        set_device(device),
+        set_device(config_options.device),
         set_memory_resource(memory_resource),
     ):
-        if executor is None or executor == "pylibcudf":
+        if config_options.executor.name == "in-memory":
            df = ir.evaluate(cache={}, timer=timer).to_polars()
            if timer is None:
                return df
            else:
                return df, timer.timings
-        elif executor == "dask-experimental":
-            from cudf_polars.experimental.parallel import evaluate_dask
+        elif config_options.executor.name == "streaming":
+            from cudf_polars.experimental.parallel import evaluate_streaming
 
-            return evaluate_dask(ir).to_polars()
-        else:
-            raise ValueError(f"Unknown executor '{executor}'")
+            return evaluate_streaming(ir, config_options).to_polars()
+        assert_never(f"Unknown executor '{config_options.executor}'")
 
 
 def execute_with_cudf(
@@ -259,7 +256,7 @@ def execute_with_cudf(
         profiling should occur).
 
     config
-        GPUEngine configuration object
+        GPUEngine object. Configuration is available as ``engine.config``.
 
     Raises
     ------
@@ -277,16 +274,22 @@ def execute_with_cudf(
    else:
        start = time.monotonic_ns()
        timer = Timer(start - duration_since_start)
-    device = config.device
+
    memory_resource = config.memory_resource
-    raise_on_fail = config.config.get("raise_on_fail", False)
-    executor = config.config.get("executor", None)
+
    with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
        translator = Translator(nt, config)
        ir = translator.translate_ir()
        ir_translation_errors = translator.errors
    if timer is not None:
        timer.store(start, time.monotonic_ns(), "gpu-ir-translation")
+
+    if (
+        memory_resource is None
+        and translator.config_options.executor.name == "streaming"
+        and translator.config_options.executor.scheduler == "distributed"
+    ):  # pragma: no cover; Requires distributed cluster
+        memory_resource = rmm.mr.get_current_device_resource()
    if len(ir_translation_errors):
        # TODO: Display these errors in user-friendly way.
        # tracked in https://github.com/rapidsai/cudf/issues/17051
@@ -301,33 +304,15 @@ def execute_with_cudf(
        exception = NotImplementedError(error_message, unique_errors)
        if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
            warnings.warn(error_message, PerformanceWarning, stacklevel=2)
-        if raise_on_fail:
+        if translator.config_options.raise_on_fail:
            raise exception
    else:
-        if POLARS_VERSION_LT_125:  # pragma: no cover
-            nt.set_udf(
-                partial(
-                    _callback,
-                    ir,
-                    should_time=False,
-                    device=device,
-                    memory_resource=memory_resource,
-                    executor=executor,
-                    timer=None,
-                )
+        nt.set_udf(
+            partial(
+                _callback,
+                ir,
+                memory_resource=memory_resource,
+                config_options=translator.config_options,
+                timer=timer,
             )
-        else:
-            nt.set_udf(
-                partial(
-                    _callback,
-                    ir,
-                    device=device,
-                    memory_resource=memory_resource,
-                    executor=executor,
-                    timer=timer,
-                )
-            )
-
-
-if POLARS_VERSION_LT_125:  # pragma: no cover
-    execute_with_cudf = partial(execute_with_cudf, duration_since_start=None)
+        )
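Note on the change above: executor dispatch now keys on config_options.executor.name ("in-memory" or "streaming") instead of the old "pylibcudf"/"dask-experimental" strings, and an unrecognised name is treated as unreachable via assert_never. A hedged sketch of how this surfaces to users, assuming pl.GPUEngine still forwards an "executor" key into the engine config (as the removed config.config.get("executor", ...) lookup implies; the exact option spelling is not shown in this diff):

    import polars as pl

    q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum())

    # Default single-GPU path (the "in-memory" executor).
    q.collect(engine=pl.GPUEngine())

    # Opt in to the renamed streaming executor (previously
    # "dask-experimental").
    q.collect(engine=pl.GPUEngine(executor="streaming"))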
cudf_polars/containers/column.py CHANGED
@@ -177,6 +177,44 @@ class Column:
             null_order=like.null_order,
         )
 
+    def check_sorted(
+        self,
+        *,
+        order: plc.types.Order,
+        null_order: plc.types.NullOrder,
+    ) -> bool:
+        """
+        Check if the column is sorted.
+
+        Parameters
+        ----------
+        order
+            The requested sort order.
+        null_order
+            Where nulls sort to.
+
+        Returns
+        -------
+        True if the column is sorted, false otherwise.
+
+        Notes
+        -----
+        If the sortedness flag is not set, this launches a kernel to
+        check sortedness.
+        """
+        if self.obj.size() <= 1 or self.obj.size() == self.obj.null_count():
+            return True
+        if self.is_sorted == plc.types.Sorted.YES:
+            return self.order == order and (
+                self.obj.null_count() == 0 or self.null_order == null_order
+            )
+        if plc.sorting.is_sorted(plc.Table([self.obj]), [order], [null_order]):
+            self.sorted = plc.types.Sorted.YES
+            self.order = order
+            self.null_order = null_order
+            return True
+        return False
+
     def astype(self, dtype: plc.DataType) -> Column:
         """
         Cast the column to as the requested dtype.
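The new Column.check_sorted caches sortedness metadata once a kernel check succeeds, so repeated queries avoid relaunching the kernel. A minimal usage sketch, building the input only with calls that appear elsewhere in this diff (plc.Scalar.from_py and plc.Column.from_scalar):

    import pylibcudf as plc
    from cudf_polars.containers import Column

    dtype = plc.DataType(plc.TypeId.INT64)
    # A length-4 column of zeros: trivially sorted, no nulls.
    col = Column(plc.Column.from_scalar(plc.Scalar.from_py(0, dtype), 4))

    # The first call may launch an is_sorted kernel; on success the
    # flags (sorted, order, null_order) are cached on the Column, so a
    # second identical call short-circuits on metadata alone.
    assert col.check_sorted(
        order=plc.types.Order.ASCENDING,
        null_order=plc.types.NullOrder.BEFORE,
    )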
cudf_polars/containers/dataframe.py CHANGED
@@ -8,19 +8,17 @@ from __future__ import annotations
 from functools import cached_property
 from typing import TYPE_CHECKING, cast
 
-import pyarrow as pa
-
 import polars as pl
 
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.utils import conversion, dtypes
+from cudf_polars.utils import conversion
 
 if TYPE_CHECKING:
     from collections.abc import Iterable, Mapping, Sequence, Set
 
-    from typing_extensions import Self
+    from typing_extensions import Any, Self
 
     from cudf_polars.typing import ColumnOptions, DataFrameHeader, Slice
 
@@ -108,17 +106,12 @@ class DataFrame:
         -------
         New dataframe representing the input.
         """
-        table = df.to_arrow()
-        schema = table.schema
-        for i, field in enumerate(schema):
-            schema = schema.set(
-                i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type))
-            )
-        # No-op if the schema is unchanged.
-        d_table = plc.interop.from_arrow(table.cast(schema))
+        plc_table = plc.Table(df)
         return cls(
-            Column(column).copy_metadata(h_col)
-            for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True)
+            Column(d_col, name=name).copy_metadata(h_col)
+            for d_col, h_col, name in zip(
+                plc_table.columns(), df.iter_columns(), df.columns, strict=True
+            )
         )
 
     @classmethod
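The Arrow round trip (df.to_arrow(), per-field list downcasting, plc.interop.from_arrow) collapses into a single plc.Table(df) construction, presumably consuming the Arrow C stream interface that polars DataFrames expose. A hedged usage sketch of the surrounding classmethod (named from_polars in this file, though the name falls outside the hunk shown):

    import polars as pl
    from cudf_polars.containers import DataFrame

    pdf = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    gpu_df = DataFrame.from_polars(pdf)  # builds the device table directly
    assert [c.name for c in gpu_df.columns] == ["a", "b"]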
@@ -246,7 +239,9 @@ class DataFrame:
             for c, other in zip(self.columns, like.columns, strict=True)
         )
 
-    def with_columns(self, columns: Iterable[Column], *, replace_only=False) -> Self:
+    def with_columns(
+        self, columns: Iterable[Column], *, replace_only: bool = False
+    ) -> Self:
         """
         Return a new dataframe with extra columns.
 
@@ -275,7 +270,7 @@ class DataFrame:
         """Drop columns by name."""
         return type(self)(column for column in self.columns if column.name not in names)
 
-    def select(self, names: Sequence[str]) -> Self:
+    def select(self, names: Sequence[str] | Mapping[str, Any]) -> Self:
         """Select columns by name returning DataFrame."""
         try:
             return type(self)(self.column_map[name] for name in names)
cudf_polars/dsl/expressions/aggregation.py CHANGED
@@ -9,22 +9,13 @@ from __future__ import annotations
 from functools import partial
 from typing import TYPE_CHECKING, Any, ClassVar
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import (
-    AggInfo,
-    ExecutionContext,
-    Expr,
-)
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal
-from cudf_polars.dsl.expressions.unary import UnaryFunction
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Agg"]
@@ -75,11 +66,15 @@ class Agg(Expr):
                 else plc.types.NullPolicy.INCLUDE
             )
         elif name == "quantile":
-            _, quantile = self.children
+            child, quantile = self.children
             if not isinstance(quantile, Literal):
                 raise NotImplementedError("Only support literal quantile values")
+            if options == "equiprobable":
+                raise NotImplementedError("Quantile with equiprobable interpolation")
+            if plc.traits.is_duration(child.dtype):
+                raise NotImplementedError("Quantile with duration data type")
             req = plc.aggregation.quantile(
-                quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options]
+                quantiles=[quantile.value], interp=Agg.interp_mapping[options]
             )
         else:
             raise NotImplementedError(
@@ -91,7 +86,9 @@ class Agg(Expr):
             op = partial(self._reduce, request=req)
         elif name in {"min", "max"}:
             op = partial(op, propagate_nans=options)
-        elif name in {"count", "sum", "first", "last"}:
+        elif name == "count":
+            op = partial(op, include_nulls=options)
+        elif name in {"sum", "first", "last"}:
             pass
         else:
             raise NotImplementedError(
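The count branch now threads its option through as include_nulls rather than lumping count with the pass-through aggregations, which lets _count (later in this file) honour both null-handling semantics. At the polars level the distinction this plausibly serves looks like the following (a hedged example; raise_on_fail=True simply makes GPU execution error instead of falling back to CPU):

    import polars as pl

    lf = pl.LazyFrame({"a": [1, None, 3]})
    out = lf.select(
        pl.col("a").count().alias("non_null"),  # excludes nulls -> 2
        pl.len().alias("rows"),                 # counts all rows -> 3
    ).collect(engine=pl.GPUEngine(raise_on_fail=True))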
@@ -124,38 +121,19 @@ class Agg(Expr):
         "linear": plc.types.Interpolation.LINEAR,
     }
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if depth >= 1:
-            raise NotImplementedError(
-                "Nested aggregations in groupby"
-            )  # pragma: no cover; check_agg trips first
-        if (isminmax := self.name in {"min", "max"}) and self.options:
-            raise NotImplementedError("Nan propagation in groupby for min/max")
-        (child,) = self.children
-        ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests
-        request = self.request
-        # These are handled specially here because we don't set up the
-        # request for the whole-frame agg because we can avoid a
-        # reduce for these.
+    @property
+    def agg_request(self) -> plc.aggregation.Aggregation:  # noqa: D102
         if self.name == "first":
-            request = plc.aggregation.nth_element(
+            return plc.aggregation.nth_element(
                 0, null_handling=plc.types.NullPolicy.INCLUDE
             )
         elif self.name == "last":
-            request = plc.aggregation.nth_element(
+            return plc.aggregation.nth_element(
                 -1, null_handling=plc.types.NullPolicy.INCLUDE
             )
-        if request is None:
-            raise NotImplementedError(
-                f"Aggregation {self.name} in groupby"
-            )  # pragma: no cover; __init__ trips first
-        if isminmax and plc.traits.is_floating_point(self.dtype):
-            assert expr is not None
-            # Ignore nans in these groupby aggs, do this by masking
-            # nans in the input
-            expr = UnaryFunction(self.dtype, "mask_nans", (), expr)
-        return AggInfo([(expr, request, self)])
+        else:
+            assert self.request is not None, "Init should have raised"
+            return self.request
 
     def _reduce(
         self, column: Column, *, request: plc.aggregation.Aggregation
@@ -167,15 +145,11 @@ class Agg(Expr):
             )
         )
 
-    def _count(self, column: Column) -> Column:
+    def _count(self, column: Column, *, include_nulls: bool) -> Column:
+        null_count = column.null_count if not include_nulls else 0
         return Column(
             plc.Column.from_scalar(
-                plc.interop.from_arrow(
-                    pa.scalar(
-                        column.size - column.null_count,
-                        type=plc.interop.to_arrow(self.dtype),
-                    ),
-                ),
+                plc.Scalar.from_py(column.size - null_count, self.dtype),
                 1,
             )
         )
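The pattern replaced throughout the rest of this file: instead of building a host pyarrow scalar and converting it (pa.scalar plus plc.interop.from_arrow, with a plc.interop.to_arrow type round trip), a device scalar is constructed directly from a Python value. Both spellings side by side, using only calls shown in this diff:

    import pyarrow as pa
    import pylibcudf as plc

    dtype = plc.DataType(plc.TypeId.FLOAT64)

    # 25.4.0: host scalar first, then interop conversion.
    old = plc.interop.from_arrow(
        pa.scalar(0.0, type=plc.interop.to_arrow(dtype))
    )

    # 25.6.0: direct construction, then e.g. broadcast to a column.
    new = plc.Scalar.from_py(0.0, dtype)
    col = plc.Column.from_scalar(new, 1)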
@@ -184,9 +158,7 @@ class Agg(Expr):
         if column.size == 0 or column.null_count == column.size:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(0, type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(0, self.dtype),
                     1,
                 )
             )
@@ -196,9 +168,7 @@ class Agg(Expr):
         if propagate_nans and column.nan_count > 0:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(float("nan"), self.dtype),
                     1,
                 )
             )
@@ -210,9 +180,7 @@ class Agg(Expr):
         if propagate_nans and column.nan_count > 0:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(float("nan"), self.dtype),
                     1,
                 )
             )
@@ -228,11 +196,7 @@ class Agg(Expr):
         return Column(plc.copying.slice(column.obj, [n - 1, n])[0])
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if context is not ExecutionContext.FRAME:
@@ -243,4 +207,4 @@ class Agg(Expr):
         # Aggregations like quantiles may have additional children that were
         # preprocessed into pylibcudf requests.
         child = self.children[0]
-        return self.op(child.evaluate(df, context=context, mapping=mapping))
+        return self.op(child.evaluate(df, context=context))
cudf_polars/dsl/expressions/base.py CHANGED
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -16,7 +16,7 @@ from cudf_polars.containers import Column
 from cudf_polars.dsl.nodebase import Node
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
+    from typing_extensions import Self
 
     from cudf_polars.containers import Column, DataFrame
 
@@ -46,11 +46,7 @@ class Expr(Node["Expr"]):
     """Names of non-child data (not Exprs) for reconstruction."""
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -61,15 +57,10 @@ class Expr(Node["Expr"]):
             DataFrame that will provide columns.
         context
             What context are we performing this evaluation in?
-        mapping
-            Substitution mapping from expressions to Columns, used to
-            override the evaluation of a given expression if we're
-            performing a simple rewritten evaluation.
 
         Notes
         -----
-        Do not call this function directly, but rather
-        :meth:`evaluate` which handles the mapping lookups.
+        Do not call this function directly, but rather :meth:`evaluate`.
 
         Returns
         -------
@@ -87,11 +78,7 @@ class Expr(Node["Expr"]):
         )  # pragma: no cover; translation of unimplemented nodes trips first
 
     def evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -102,10 +89,6 @@ class Expr(Node["Expr"]):
             DataFrame that will provide columns.
         context
             What context are we performing this evaluation in?
-        mapping
-            Substitution mapping from expressions to Columns, used to
-            override the evaluation of a given expression if we're
-            performing a simple rewritten evaluation.
 
         Notes
         -----
@@ -124,37 +107,28 @@ class Expr(Node["Expr"]):
         are returned during translation to the IR, but for now we
         are not perfect.
         """
-        if mapping is None:
-            return self.do_evaluate(df, context=context, mapping=mapping)
-        try:
-            return mapping[self]
-        except KeyError:
-            return self.do_evaluate(df, context=context, mapping=mapping)
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """
-        Collect information about aggregations in groupbys.
+        return self.do_evaluate(df, context=context)
 
-        Parameters
-        ----------
-        depth
-            The depth of aggregating (reduction or sampling)
-            expressions we are currently at.
+    @property
+    def agg_request(self) -> plc.aggregation.Aggregation:
+        """
+        The aggregation for this expression in a grouped aggregation.
 
         Returns
         -------
-        Aggregation info describing the expression to aggregate in the
-        groupby.
+        Aggregation request. Default is to collect the expression.
+
+        Notes
+        -----
+        This presumes that the IR translation has decomposed groupby
+        reductions only into cases we can handle.
 
         Raises
         ------
         NotImplementedError
-            If we can't currently perform the aggregation request, for
-            example nested aggregations like ``a.max().min()``.
+            If requesting an aggregation from an unexpected expression.
         """
-        raise NotImplementedError(
-            f"Collecting aggregation info for {type(self).__name__}"
-        )  # pragma: no cover; check_agg trips first
+        return plc.aggregation.collect_list()
 
 
 class ErrorExpr(Expr):
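The removed collect_agg machinery (a recursive walk gathering AggInfo per expression) is replaced by a single agg_request property: the base class answers collect_list, i.e. "gather the group's values", and subclasses such as Agg override it (see the first/last nth_element mapping earlier in this diff). A hedged sketch of the default, assuming Col keeps its (dtype, name) constructor:

    import pylibcudf as plc
    from cudf_polars.dsl.expressions.base import Col

    expr = Col(plc.DataType(plc.TypeId.INT64), "a")
    # A bare column reference inside a grouped aggregation falls back
    # to the base-class default: collect the group's values.
    req = expr.agg_request
    assert isinstance(req, plc.aggregation.Aggregation)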
@@ -166,7 +140,7 @@ class ErrorExpr(Expr):
         self.dtype = dtype
         self.error = error
         self.children = ()
-        self.is_pointwise = True
+        self.is_pointwise = False
 
 
 class NamedExpr:
@@ -202,11 +176,7 @@ class NamedExpr:
         return not self.__eq__(other)
 
     def evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -217,8 +187,6 @@ class NamedExpr:
             DataFrame providing context
         context
             Execution context
-        mapping
-            Substitution mapping
 
         Returns
         -------
@@ -229,13 +197,25 @@ class NamedExpr:
         :meth:`Expr.evaluate` for details, this function just adds the
         name to a column produced from an expression.
         """
-        return self.value.evaluate(df, context=context, mapping=mapping).rename(
-            self.name
-        )
+        return self.value.evaluate(df, context=context).rename(self.name)
+
+    def reconstruct(self, expr: Expr) -> Self:
+        """
+        Rebuild with a new `Expr` value.
+
+        Parameters
+        ----------
+        expr
+            New `Expr` value
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return self.value.collect_agg(depth=depth)
+        Returns
+        -------
+        New `NamedExpr` with `expr` as the underlying expression.
+        The name of the original `NamedExpr` is preserved.
+        """
+        if expr is self.value:
+            return self
+        return type(self)(self.name, expr)
 
 
 class Col(Expr):
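NamedExpr.reconstruct gives rewrite passes (such as the new groupby decomposition utilities listed at the top of this diff) a cheap way to swap out the wrapped expression while keeping the output name, and it returns self unchanged when the child is identical, so no-op rewrites allocate nothing. A sketch, again assuming Col's (dtype, name) constructor:

    import pylibcudf as plc
    from cudf_polars.dsl.expressions.base import Col, NamedExpr

    dtype = plc.DataType(plc.TypeId.INT64)
    named = NamedExpr("out", Col(dtype, "a"))

    assert named.reconstruct(named.value) is named            # no-op is free
    assert named.reconstruct(Col(dtype, "b")).name == "out"   # name preserved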
@@ -250,21 +230,13 @@ class Col(Expr):
         self.children = ()
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # Deliberately remove the name here so that we guarantee
         # evaluation of the IR produces names.
         return df.column_map[self.name].rename(None)
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo([(self, plc.aggregation.collect_list(), self)])
-
 
 class ColRef(Expr):
     __slots__ = ("index", "table_ref")
@@ -288,11 +260,7 @@ class ColRef(Expr):
         self.children = (column,)
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         raise NotImplementedError(