cudf-polars-cu12 25.8.0__py3-none-any.whl → 25.10.0__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (57)
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +1 -10
  4. cudf_polars/containers/column.py +29 -0
  5. cudf_polars/containers/dataframe.py +15 -6
  6. cudf_polars/containers/datatype.py +2 -0
  7. cudf_polars/dsl/expressions/base.py +4 -0
  8. cudf_polars/dsl/expressions/boolean.py +41 -20
  9. cudf_polars/dsl/expressions/datetime.py +1 -0
  10. cudf_polars/dsl/expressions/rolling.py +487 -7
  11. cudf_polars/dsl/expressions/string.py +275 -21
  12. cudf_polars/dsl/expressions/struct.py +3 -4
  13. cudf_polars/dsl/expressions/unary.py +148 -32
  14. cudf_polars/dsl/ir.py +244 -61
  15. cudf_polars/dsl/translate.py +68 -21
  16. cudf_polars/dsl/utils/aggregations.py +201 -21
  17. cudf_polars/dsl/utils/groupby.py +6 -1
  18. cudf_polars/dsl/utils/rolling.py +7 -1
  19. cudf_polars/dsl/utils/windows.py +7 -1
  20. cudf_polars/experimental/base.py +258 -25
  21. cudf_polars/experimental/benchmarks/pdsds.py +8 -4
  22. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  23. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  24. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  25. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  26. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  27. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  28. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  29. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  30. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  31. cudf_polars/experimental/benchmarks/pdsh.py +2 -0
  32. cudf_polars/experimental/benchmarks/utils.py +140 -33
  33. cudf_polars/experimental/dispatch.py +58 -1
  34. cudf_polars/experimental/distinct.py +3 -0
  35. cudf_polars/experimental/explain.py +33 -3
  36. cudf_polars/experimental/expressions.py +45 -2
  37. cudf_polars/experimental/groupby.py +3 -0
  38. cudf_polars/experimental/io.py +80 -39
  39. cudf_polars/experimental/join.py +5 -3
  40. cudf_polars/experimental/parallel.py +39 -6
  41. cudf_polars/experimental/select.py +32 -5
  42. cudf_polars/experimental/shuffle.py +82 -30
  43. cudf_polars/experimental/sort.py +575 -11
  44. cudf_polars/experimental/statistics.py +795 -0
  45. cudf_polars/experimental/utils.py +65 -8
  46. cudf_polars/testing/asserts.py +23 -12
  47. cudf_polars/testing/io.py +52 -2
  48. cudf_polars/testing/plugin.py +14 -23
  49. cudf_polars/utils/config.py +154 -18
  50. cudf_polars/utils/dtypes.py +17 -4
  51. cudf_polars/utils/versions.py +3 -1
  52. {cudf_polars_cu12-25.8.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/METADATA +18 -7
  53. cudf_polars_cu12-25.10.0.dist-info/RECORD +92 -0
  54. cudf_polars_cu12-25.8.0.dist-info/RECORD +0 -81
  55. {cudf_polars_cu12-25.8.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/WHEEL +0 -0
  56. {cudf_polars_cu12-25.8.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/licenses/LICENSE +0 -0
  57. {cudf_polars_cu12-25.8.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/top_level.txt +0 -0
cudf_polars/GIT_COMMIT ADDED
@@ -0,0 +1 @@
+f4e35ca02118eada383e7417273c6cb1857ec66e
cudf_polars/VERSION CHANGED
@@ -1 +1 @@
-25.08.00
+25.10.00
cudf_polars/callback.py CHANGED
@@ -40,14 +40,6 @@ if TYPE_CHECKING:
 __all__: list[str] = ["execute_with_cudf"]
 
 
-_SUPPORTED_PREFETCHES = {
-    "column_view::get_data",
-    "mutable_column_view::get_data",
-    "gather",
-    "hash_join",
-}
-
-
 @cache
 def default_memory_resource(
     device: int,
@@ -80,8 +72,7 @@ def default_memory_resource(
     # Leaving a 20% headroom to avoid OOM errors.
     free_memory, _ = rmm.mr.available_device_memory()
     free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
-    for key in _SUPPORTED_PREFETCHES:
-        pylibcudf.experimental.enable_prefetching(key)
+    pylibcudf.prefetch.enable()
     mr = rmm.mr.PrefetchResourceAdaptor(
         rmm.mr.PoolMemoryResource(
             rmm.mr.ManagedMemoryResource(),
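Note: the callback.py hunk above replaces per-hotspot prefetch registration with a single global switch. Below is a rough sketch of the new default-memory-resource path using only the calls visible in the hunk; the final set_current_device_resource call and the pool_size variable name are illustrative assumptions, not part of the diff.

    import rmm
    import pylibcudf

    # Prefetching is now enabled globally instead of per copy-key (assumed sketch).
    pylibcudf.prefetch.enable()

    # Keep ~20% headroom and snap the pool size to a 256-byte multiple,
    # e.g. 40 GiB free -> 32 GiB pool.
    free_memory, _ = rmm.mr.available_device_memory()
    pool_size = int(round(float(free_memory) * 0.80 / 256) * 256)

    mr = rmm.mr.PrefetchResourceAdaptor(
        rmm.mr.PoolMemoryResource(rmm.mr.ManagedMemoryResource(), pool_size)
    )
    rmm.mr.set_current_device_resource(mr)  # assumed usage; cudf-polars wires this up internally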
cudf_polars/containers/column.py CHANGED
@@ -293,6 +293,35 @@ class Column:
             or self.obj.type().id() == plc.TypeId.STRING
         ):
             return Column(self._handle_string_cast(plc_dtype), dtype=dtype)
+        elif plc.traits.is_integral_not_bool(
+            self.obj.type()
+        ) and plc.traits.is_timestamp(plc_dtype):
+            upcasted = plc.unary.cast(self.obj, plc.DataType(plc.TypeId.INT64))
+            result = plc.column.Column(
+                plc_dtype,
+                upcasted.size(),
+                upcasted.data(),
+                upcasted.null_mask(),
+                upcasted.null_count(),
+                upcasted.offset(),
+                upcasted.children(),
+            )
+            return Column(result, dtype=dtype).sorted_like(self)
+        elif plc.traits.is_integral_not_bool(plc_dtype) and plc.traits.is_timestamp(
+            self.obj.type()
+        ):
+            result = plc.column.Column(
+                plc.DataType(plc.TypeId.INT64),
+                self.obj.size(),
+                self.obj.data(),
+                self.obj.null_mask(),
+                self.obj.null_count(),
+                self.obj.offset(),
+                self.obj.children(),
+            )
+            return Column(plc.unary.cast(result, plc_dtype), dtype=dtype).sorted_like(
+                self
+            )
         else:
             result = Column(plc.unary.cast(self.obj, plc_dtype), dtype=dtype)
             if is_order_preserving_cast(self.obj.type(), plc_dtype):
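Note: the new branches above let the Column cast path reinterpret integer columns as timestamps (and vice versa) by round-tripping through INT64 and reusing the existing data buffer and null mask. At the polars level this corresponds to casts like the sketch below; the example data and the GPU-engine call are illustrative assumptions.

    import polars as pl

    # Integers cast to Datetime are interpreted as ticks since the epoch in the
    # target time unit; 86_400_000_000 microseconds is exactly one day.
    q = pl.LazyFrame({"t": [0, 86_400_000_000]}).with_columns(
        pl.col("t").cast(pl.Datetime("us"))
    )
    q.collect(engine="gpu")  # assumes a GPU-enabled polars install with cudf-polars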
cudf_polars/containers/dataframe.py CHANGED
@@ -29,24 +29,33 @@ __all__: list[str] = ["DataFrame"]
 def _create_polars_column_metadata(
     name: str, dtype: PolarsDataType
 ) -> plc.interop.ColumnMetadata:
-    """Create ColumnMetadata preserving pl.Struct field names."""
+    """Create ColumnMetadata preserving dtype attributes not supported by libcudf."""
+    children_meta = []
+    timezone = ""
+    precision: int | None = None
+
     if isinstance(dtype, pl.Struct):
         children_meta = [
             _create_polars_column_metadata(field.name, field.dtype)
             for field in dtype.fields
         ]
-    else:
-        children_meta = []
-    timezone = dtype.time_zone if isinstance(dtype, pl.Datetime) else None
+    elif isinstance(dtype, pl.Datetime):
+        timezone = dtype.time_zone or timezone
+    elif isinstance(dtype, pl.Decimal):
+        precision = dtype.precision
+
     return plc.interop.ColumnMetadata(
-        name=name, timezone=timezone or "", children_meta=children_meta
+        name=name,
+        timezone=timezone,
+        precision=precision,
+        children_meta=children_meta,
     )
 
 
 # This is also defined in pylibcudf.interop
 class _ObjectWithArrowMetadata:
     def __init__(
-        self, obj: plc.Table, metadata: list[plc.interop.ColumnMetadata]
+        self, obj: plc.Table | plc.Column, metadata: list[plc.interop.ColumnMetadata]
     ) -> None:
         self.obj = obj
         self.metadata = metadata
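Note: _create_polars_column_metadata now also records a Datetime timezone and a Decimal precision, since libcudf columns carry neither attribute. A minimal sketch of the resulting object, assuming the 25.10 ColumnMetadata signature shown in the hunk; the column name and precision value are made up for illustration.

    import pylibcudf as plc

    # Metadata for a pl.Decimal(precision=10, scale=2) column named "price";
    # the precision travels in metadata because the libcudf column only knows
    # its DECIMAL128 scale.
    meta = plc.interop.ColumnMetadata(
        name="price",
        timezone="",
        precision=10,
        children_meta=[],
    )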
cudf_polars/containers/datatype.py CHANGED
@@ -81,6 +81,8 @@ def _from_polars(dtype: pl.DataType) -> plc.DataType:
         assert_never(dtype.time_unit)
     elif isinstance(dtype, pl.String):
         return plc.DataType(plc.TypeId.STRING)
+    elif isinstance(dtype, pl.Decimal):
+        return plc.DataType(plc.TypeId.DECIMAL128, scale=-dtype.scale)
     elif isinstance(dtype, pl.Null):
         # TODO: Hopefully
         return plc.DataType(plc.TypeId.EMPTY)
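Note: the new Decimal branch maps polars' non-negative "digits after the decimal point" scale onto libcudf's signed power-of-ten fixed-point scale, hence the sign flip. A small illustration (the variable names are ours):

    import polars as pl
    import pylibcudf as plc

    dtype = pl.Decimal(precision=38, scale=2)      # values like 123.45
    plc_type = plc.DataType(plc.TypeId.DECIMAL128, scale=-dtype.scale)
    # libcudf stores 123.45 as the integer 12345 with scale -2 (12345 * 10**-2).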
cudf_polars/dsl/expressions/base.py CHANGED
@@ -31,6 +31,10 @@ class ExecutionContext(IntEnum):
     FRAME = enum.auto()
     GROUPBY = enum.auto()
     ROLLING = enum.auto()
+    # Follows GROUPBY semantics but useful
+    # to differentiate from GROUPBY so we can
+    # implement agg/per-row ops independently
+    WINDOW = enum.auto()
 
 
 class Expr(Node["Expr"]):
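Note: the WINDOW context is only an enum member here; per the in-diff comment it shares GROUPBY semantics but keeps per-row evaluation separate. Presumably this targets window expressions such as pl.Expr.over, where a grouped aggregate is broadcast back to every row; that connection is our assumption, not something the diff states.

    import polars as pl

    # A grouped sum broadcast back onto each row of its group.
    q = pl.LazyFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]}).with_columns(
        pl.col("x").sum().over("g").alias("group_sum")
    )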
cudf_polars/dsl/expressions/boolean.py CHANGED
@@ -38,6 +38,7 @@ class BooleanFunction(Expr):
         Any = auto()
         AnyHorizontal = auto()
         IsBetween = auto()
+        IsClose = auto()
         IsDuplicated = auto()
         IsFinite = auto()
         IsFirstDistinct = auto()
@@ -85,6 +86,12 @@
             BooleanFunction.Name.IsLastDistinct,
             BooleanFunction.Name.IsUnique,
         )
+        if self.name in {
+            BooleanFunction.Name.IsClose,
+        }:
+            raise NotImplementedError(
+                f"Boolean function {self.name}"
+            ) # pragma: no cover
 
     @staticmethod
     def _distinct(
@@ -146,13 +153,18 @@
         ):
             # Avoid evaluating the child if the dtype tells us it's unnecessary.
             (child,) = self.children
+            needles = child.evaluate(df, context=context)
+            is_float = needles.obj.type().id() in (
+                plc.TypeId.FLOAT32,
+                plc.TypeId.FLOAT64,
+            )
             is_finite = self.name is BooleanFunction.Name.IsFinite
-            if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
-                value = plc.Scalar.from_py(is_finite)
-                return Column(
-                    plc.Column.from_scalar(value, df.num_rows), dtype=self.dtype
+            if not is_float:
+                base = plc.Column.from_scalar(
+                    plc.Scalar.from_py(py_val=is_finite), needles.size
                 )
-            needles = child.evaluate(df, context=context)
+                out = base.with_mask(needles.obj.null_mask(), needles.null_count)
+                return Column(out, dtype=self.dtype)
             to_search = [-float("inf"), float("inf")]
             if is_finite:
                 # NaN is neither finite not infinite
@@ -164,7 +176,10 @@
             result = plc.search.contains(haystack, needles.obj)
             if is_finite:
                 result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
-            return Column(result, dtype=self.dtype)
+            return Column(
+                result.with_mask(needles.obj.null_mask(), needles.null_count),
+                dtype=self.dtype,
+            )
         columns = [child.evaluate(df, context=context) for child in self.children]
         # Kleene logic for Any (OR) and All (AND) if ignore_nulls is
         # False
@@ -199,22 +214,28 @@
         elif self.name is BooleanFunction.Name.IsNotNull:
             (column,) = columns
             return Column(plc.unary.is_valid(column.obj), dtype=self.dtype)
-        elif self.name is BooleanFunction.Name.IsNan:
+        elif self.name in (BooleanFunction.Name.IsNan, BooleanFunction.Name.IsNotNan):
             (column,) = columns
-            return Column(
-                plc.unary.is_nan(column.obj).with_mask(
-                    column.obj.null_mask(), column.null_count
-                ),
-                dtype=self.dtype,
-            )
-        elif self.name is BooleanFunction.Name.IsNotNan:
-            (column,) = columns
-            return Column(
-                plc.unary.is_not_nan(column.obj).with_mask(
-                    column.obj.null_mask(), column.null_count
-                ),
-                dtype=self.dtype,
+            is_float = column.obj.type().id() in (
+                plc.TypeId.FLOAT32,
+                plc.TypeId.FLOAT64,
             )
+            if is_float:
+                op = (
+                    plc.unary.is_nan
+                    if self.name is BooleanFunction.Name.IsNan
+                    else plc.unary.is_not_nan
+                )
+                base = op(column.obj)
+            else:
+                base = plc.Column.from_scalar(
+                    plc.Scalar.from_py(
+                        py_val=self.name is not BooleanFunction.Name.IsNan
+                    ),
+                    column.size,
+                )
+            out = base.with_mask(column.obj.null_mask(), column.null_count)
+            return Column(out, dtype=self.dtype)
         elif self.name is BooleanFunction.Name.IsFirstDistinct:
             (column,) = columns
             return self._distinct(
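Note: the boolean.py changes above make the IsFinite/IsInfinite and IsNan/IsNotNan paths propagate the input validity mask (nulls stay null instead of collapsing to a boolean) and give non-float inputs a constant answer without calling floating-point kernels. Roughly the user-visible behaviour, sketched with the polars API; the GPU-engine collection step is assumed and omitted here.

    import polars as pl

    q = pl.LazyFrame({"x": [1.0, float("nan"), None]}).select(
        finite=pl.col("x").is_finite(),   # [true, false, null]
        nan=pl.col("x").is_nan(),         # [false, true, null]
    )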
cudf_polars/dsl/expressions/datetime.py CHANGED
@@ -38,6 +38,7 @@ class TemporalFunction(Expr):
         Datetime = auto()
         DatetimeFunction = auto()
         Day = auto()
+        DaysInMonth = auto()
         Duration = auto()
         Hour = auto()
         IsLeapYear = auto()