cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
@@ -17,7 +17,7 @@ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
17
17
  if TYPE_CHECKING:
18
18
  from typing_extensions import Self
19
19
 
20
- from polars.polars import _expr_nodes as pl_expr
20
+ from polars import polars # type: ignore[attr-defined]
21
21
 
22
22
  from cudf_polars.containers import DataFrame, DataType
23
23
 
@@ -75,7 +75,7 @@ class TemporalFunction(Expr):
75
75
  Year = auto()
76
76
 
77
77
  @classmethod
78
- def from_polars(cls, obj: pl_expr.TemporalFunction) -> Self:
78
+ def from_polars(cls, obj: polars._expr_nodes.TemporalFunction) -> Self:
79
79
  """Convert from polars' `TemporalFunction`."""
80
80
  try:
81
81
  function, name = str(obj).split(".", maxsplit=1)
@@ -129,7 +129,7 @@ class TemporalFunction(Expr):
129
129
  raise NotImplementedError(f"Temporal function {self.name}")
130
130
 
131
131
  if self.name is TemporalFunction.Name.ToString and plc.traits.is_duration(
132
- self.children[0].dtype.plc
132
+ self.children[0].dtype.plc_type
133
133
  ):
134
134
  raise NotImplementedError("ToString is not supported on duration types")
135
135
 
@@ -140,13 +140,19 @@ class TemporalFunction(Expr):
140
140
  columns = [child.evaluate(df, context=context) for child in self.children]
141
141
  (column,) = columns
142
142
  if self.name is TemporalFunction.Name.CastTimeUnit:
143
- return Column(plc.unary.cast(column.obj, self.dtype.plc), dtype=self.dtype)
143
+ return Column(
144
+ plc.unary.cast(column.obj, self.dtype.plc_type, stream=df.stream),
145
+ dtype=self.dtype,
146
+ )
144
147
  if self.name == TemporalFunction.Name.ToString:
145
148
  return Column(
146
149
  plc.strings.convert.convert_datetime.from_timestamps(
147
150
  column.obj,
148
151
  self.options[0],
149
- plc.Column.from_iterable_of_py([], dtype=self.dtype.plc),
152
+ plc.Column.from_iterable_of_py(
153
+ [], dtype=self.dtype.plc_type, stream=df.stream
154
+ ),
155
+ stream=df.stream,
150
156
  ),
151
157
  dtype=self.dtype,
152
158
  )
@@ -156,10 +162,12 @@ class TemporalFunction(Expr):
156
162
  column.obj,
157
163
  format="%V",
158
164
  input_strings_names=plc.Column.from_iterable_of_py(
159
- [], dtype=plc.DataType(plc.TypeId.STRING)
165
+ [], dtype=plc.DataType(plc.TypeId.STRING), stream=df.stream
160
166
  ),
167
+ stream=df.stream,
161
168
  ),
162
- self.dtype.plc,
169
+ self.dtype.plc_type,
170
+ stream=df.stream,
163
171
  )
164
172
  return Column(result, dtype=self.dtype)
165
173
  if self.name is TemporalFunction.Name.IsoYear:
@@ -168,97 +176,117 @@ class TemporalFunction(Expr):
168
176
  column.obj,
169
177
  format="%G",
170
178
  input_strings_names=plc.Column.from_iterable_of_py(
171
- [], dtype=plc.DataType(plc.TypeId.STRING)
179
+ [], dtype=plc.DataType(plc.TypeId.STRING), stream=df.stream
172
180
  ),
181
+ stream=df.stream,
173
182
  ),
174
- self.dtype.plc,
183
+ self.dtype.plc_type,
184
+ stream=df.stream,
175
185
  )
176
186
  return Column(result, dtype=self.dtype)
177
187
  if self.name is TemporalFunction.Name.MonthStart:
178
- ends = plc.datetime.last_day_of_month(column.obj)
179
- days_to_subtract = plc.datetime.days_in_month(column.obj)
188
+ ends = plc.datetime.last_day_of_month(column.obj, stream=df.stream)
189
+ days_to_subtract = plc.datetime.days_in_month(column.obj, stream=df.stream)
180
190
  # must subtract 1 to avoid rolling over to the previous month
181
191
  days_to_subtract = plc.binaryop.binary_operation(
182
192
  days_to_subtract,
183
- plc.Scalar.from_py(1, plc.DataType(plc.TypeId.INT32)),
193
+ plc.Scalar.from_py(1, plc.DataType(plc.TypeId.INT32), stream=df.stream),
184
194
  plc.binaryop.BinaryOperator.SUB,
185
195
  plc.DataType(plc.TypeId.DURATION_DAYS),
196
+ stream=df.stream,
186
197
  )
187
198
  result = plc.binaryop.binary_operation(
188
199
  ends,
189
200
  days_to_subtract,
190
201
  plc.binaryop.BinaryOperator.SUB,
191
- self.dtype.plc,
202
+ self.dtype.plc_type,
203
+ stream=df.stream,
192
204
  )
193
205
 
194
206
  return Column(result, dtype=self.dtype)
195
207
  if self.name is TemporalFunction.Name.MonthEnd:
196
208
  return Column(
197
209
  plc.unary.cast(
198
- plc.datetime.last_day_of_month(column.obj), self.dtype.plc
210
+ plc.datetime.last_day_of_month(column.obj, stream=df.stream),
211
+ self.dtype.plc_type,
212
+ stream=df.stream,
199
213
  ),
200
214
  dtype=self.dtype,
201
215
  )
202
216
  if self.name is TemporalFunction.Name.IsLeapYear:
203
217
  return Column(
204
- plc.datetime.is_leap_year(column.obj),
218
+ plc.datetime.is_leap_year(column.obj, stream=df.stream),
205
219
  dtype=self.dtype,
206
220
  )
207
221
  if self.name is TemporalFunction.Name.OrdinalDay:
208
- return Column(plc.datetime.day_of_year(column.obj), dtype=self.dtype)
222
+ return Column(
223
+ plc.datetime.day_of_year(column.obj, stream=df.stream), dtype=self.dtype
224
+ )
209
225
  if self.name is TemporalFunction.Name.Microsecond:
210
226
  millis = plc.datetime.extract_datetime_component(
211
- column.obj, plc.datetime.DatetimeComponent.MILLISECOND
227
+ column.obj, plc.datetime.DatetimeComponent.MILLISECOND, stream=df.stream
212
228
  )
213
229
  micros = plc.datetime.extract_datetime_component(
214
- column.obj, plc.datetime.DatetimeComponent.MICROSECOND
230
+ column.obj, plc.datetime.DatetimeComponent.MICROSECOND, stream=df.stream
215
231
  )
216
232
  millis_as_micros = plc.binaryop.binary_operation(
217
233
  millis,
218
- plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
234
+ plc.Scalar.from_py(
235
+ 1_000, plc.DataType(plc.TypeId.INT32), stream=df.stream
236
+ ),
219
237
  plc.binaryop.BinaryOperator.MUL,
220
- self.dtype.plc,
238
+ self.dtype.plc_type,
239
+ stream=df.stream,
221
240
  )
222
241
  total_micros = plc.binaryop.binary_operation(
223
242
  micros,
224
243
  millis_as_micros,
225
244
  plc.binaryop.BinaryOperator.ADD,
226
- self.dtype.plc,
245
+ self.dtype.plc_type,
246
+ stream=df.stream,
227
247
  )
228
248
  return Column(total_micros, dtype=self.dtype)
229
249
  elif self.name is TemporalFunction.Name.Nanosecond:
230
250
  millis = plc.datetime.extract_datetime_component(
231
- column.obj, plc.datetime.DatetimeComponent.MILLISECOND
251
+ column.obj, plc.datetime.DatetimeComponent.MILLISECOND, stream=df.stream
232
252
  )
233
253
  micros = plc.datetime.extract_datetime_component(
234
- column.obj, plc.datetime.DatetimeComponent.MICROSECOND
254
+ column.obj, plc.datetime.DatetimeComponent.MICROSECOND, stream=df.stream
235
255
  )
236
256
  nanos = plc.datetime.extract_datetime_component(
237
- column.obj, plc.datetime.DatetimeComponent.NANOSECOND
257
+ column.obj, plc.datetime.DatetimeComponent.NANOSECOND, stream=df.stream
238
258
  )
239
259
  millis_as_nanos = plc.binaryop.binary_operation(
240
260
  millis,
241
- plc.Scalar.from_py(1_000_000, plc.DataType(plc.TypeId.INT32)),
261
+ plc.Scalar.from_py(
262
+ 1_000_000, plc.DataType(plc.TypeId.INT32), stream=df.stream
263
+ ),
242
264
  plc.binaryop.BinaryOperator.MUL,
243
- self.dtype.plc,
265
+ self.dtype.plc_type,
266
+ stream=df.stream,
244
267
  )
245
268
  micros_as_nanos = plc.binaryop.binary_operation(
246
269
  micros,
247
- plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
270
+ plc.Scalar.from_py(
271
+ 1_000, plc.DataType(plc.TypeId.INT32), stream=df.stream
272
+ ),
248
273
  plc.binaryop.BinaryOperator.MUL,
249
- self.dtype.plc,
274
+ self.dtype.plc_type,
275
+ stream=df.stream,
250
276
  )
251
277
  total_nanos = plc.binaryop.binary_operation(
252
278
  nanos,
253
279
  millis_as_nanos,
254
280
  plc.binaryop.BinaryOperator.ADD,
255
- self.dtype.plc,
281
+ self.dtype.plc_type,
282
+ stream=df.stream,
256
283
  )
257
284
  total_nanos = plc.binaryop.binary_operation(
258
285
  total_nanos,
259
286
  micros_as_nanos,
260
287
  plc.binaryop.BinaryOperator.ADD,
261
- self.dtype.plc,
288
+ self.dtype.plc_type,
289
+ stream=df.stream,
262
290
  )
263
291
  return Column(total_nanos, dtype=self.dtype)
264
292
 
@@ -266,6 +294,7 @@ class TemporalFunction(Expr):
266
294
  plc.datetime.extract_datetime_component(
267
295
  column.obj,
268
296
  self._COMPONENT_MAP[self.name],
297
+ stream=df.stream,
269
298
  ),
270
299
  dtype=self.dtype,
271
300
  )
@@ -43,7 +43,11 @@ class Literal(Expr):
43
43
  ) -> Column:
44
44
  """Evaluate this expression given a dataframe for context."""
45
45
  return Column(
46
- plc.Column.from_scalar(plc.Scalar.from_py(self.value, self.dtype.plc), 1),
46
+ plc.Column.from_scalar(
47
+ plc.Scalar.from_py(self.value, self.dtype.plc_type, stream=df.stream),
48
+ 1,
49
+ stream=df.stream,
50
+ ),
47
51
  dtype=self.dtype,
48
52
  )
49
53
 
@@ -60,8 +64,8 @@ class Literal(Expr):
60
64
  else:
61
65
  # Use polars to cast instead of pylibcudf
62
66
  # since there are just Python scalars
63
- casted = pl.Series(values=[self.value], dtype=self.dtype.polars).cast(
64
- dtype.polars
67
+ casted = pl.Series(values=[self.value], dtype=self.dtype.polars_type).cast(
68
+ dtype.polars_type
65
69
  )[0]
66
70
  return Literal(dtype, casted)
67
71
 
@@ -82,13 +86,15 @@ class LiteralColumn(Expr):
82
86
  # This is stricter than necessary, but we only need this hash
83
87
  # for identity in groupby replacements so it's OK. And this
84
88
  # way we avoid doing potentially expensive compute.
85
- return (type(self), self.dtype.plc, id(self.value))
89
+ return (type(self), self.dtype.plc_type, id(self.value))
86
90
 
87
91
  def do_evaluate(
88
92
  self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
89
93
  ) -> Column:
90
94
  """Evaluate this expression given a dataframe for context."""
91
- return Column(plc.Column.from_arrow(self.value), dtype=self.dtype)
95
+ return Column(
96
+ plc.Column.from_arrow(self.value, stream=df.stream), dtype=self.dtype
97
+ )
92
98
 
93
99
  @property
94
100
  def agg_request(self) -> NoReturn: # noqa: D102