cudf-polars-cu13 25.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -0
  3. cudf_polars/__init__.py +28 -0
  4. cudf_polars/_version.py +21 -0
  5. cudf_polars/callback.py +318 -0
  6. cudf_polars/containers/__init__.py +13 -0
  7. cudf_polars/containers/column.py +495 -0
  8. cudf_polars/containers/dataframe.py +361 -0
  9. cudf_polars/containers/datatype.py +137 -0
  10. cudf_polars/dsl/__init__.py +8 -0
  11. cudf_polars/dsl/expr.py +66 -0
  12. cudf_polars/dsl/expressions/__init__.py +8 -0
  13. cudf_polars/dsl/expressions/aggregation.py +226 -0
  14. cudf_polars/dsl/expressions/base.py +272 -0
  15. cudf_polars/dsl/expressions/binaryop.py +120 -0
  16. cudf_polars/dsl/expressions/boolean.py +326 -0
  17. cudf_polars/dsl/expressions/datetime.py +271 -0
  18. cudf_polars/dsl/expressions/literal.py +97 -0
  19. cudf_polars/dsl/expressions/rolling.py +643 -0
  20. cudf_polars/dsl/expressions/selection.py +74 -0
  21. cudf_polars/dsl/expressions/slicing.py +46 -0
  22. cudf_polars/dsl/expressions/sorting.py +85 -0
  23. cudf_polars/dsl/expressions/string.py +1002 -0
  24. cudf_polars/dsl/expressions/struct.py +137 -0
  25. cudf_polars/dsl/expressions/ternary.py +49 -0
  26. cudf_polars/dsl/expressions/unary.py +517 -0
  27. cudf_polars/dsl/ir.py +2607 -0
  28. cudf_polars/dsl/nodebase.py +164 -0
  29. cudf_polars/dsl/to_ast.py +359 -0
  30. cudf_polars/dsl/tracing.py +16 -0
  31. cudf_polars/dsl/translate.py +939 -0
  32. cudf_polars/dsl/traversal.py +224 -0
  33. cudf_polars/dsl/utils/__init__.py +8 -0
  34. cudf_polars/dsl/utils/aggregations.py +481 -0
  35. cudf_polars/dsl/utils/groupby.py +98 -0
  36. cudf_polars/dsl/utils/naming.py +34 -0
  37. cudf_polars/dsl/utils/replace.py +61 -0
  38. cudf_polars/dsl/utils/reshape.py +74 -0
  39. cudf_polars/dsl/utils/rolling.py +121 -0
  40. cudf_polars/dsl/utils/windows.py +192 -0
  41. cudf_polars/experimental/__init__.py +8 -0
  42. cudf_polars/experimental/base.py +386 -0
  43. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  44. cudf_polars/experimental/benchmarks/pdsds.py +220 -0
  45. cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
  46. cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
  47. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  48. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  49. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  50. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  51. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  52. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  53. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  54. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  55. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  56. cudf_polars/experimental/benchmarks/pdsh.py +814 -0
  57. cudf_polars/experimental/benchmarks/utils.py +832 -0
  58. cudf_polars/experimental/dask_registers.py +200 -0
  59. cudf_polars/experimental/dispatch.py +156 -0
  60. cudf_polars/experimental/distinct.py +197 -0
  61. cudf_polars/experimental/explain.py +157 -0
  62. cudf_polars/experimental/expressions.py +590 -0
  63. cudf_polars/experimental/groupby.py +327 -0
  64. cudf_polars/experimental/io.py +943 -0
  65. cudf_polars/experimental/join.py +391 -0
  66. cudf_polars/experimental/parallel.py +423 -0
  67. cudf_polars/experimental/repartition.py +69 -0
  68. cudf_polars/experimental/scheduler.py +155 -0
  69. cudf_polars/experimental/select.py +188 -0
  70. cudf_polars/experimental/shuffle.py +354 -0
  71. cudf_polars/experimental/sort.py +609 -0
  72. cudf_polars/experimental/spilling.py +151 -0
  73. cudf_polars/experimental/statistics.py +795 -0
  74. cudf_polars/experimental/utils.py +169 -0
  75. cudf_polars/py.typed +0 -0
  76. cudf_polars/testing/__init__.py +8 -0
  77. cudf_polars/testing/asserts.py +448 -0
  78. cudf_polars/testing/io.py +122 -0
  79. cudf_polars/testing/plugin.py +236 -0
  80. cudf_polars/typing/__init__.py +219 -0
  81. cudf_polars/utils/__init__.py +8 -0
  82. cudf_polars/utils/config.py +741 -0
  83. cudf_polars/utils/conversion.py +40 -0
  84. cudf_polars/utils/dtypes.py +118 -0
  85. cudf_polars/utils/sorting.py +53 -0
  86. cudf_polars/utils/timer.py +39 -0
  87. cudf_polars/utils/versions.py +27 -0
  88. cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
  89. cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
  90. cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
  91. cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
  92. cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,495 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """A column, with some properties."""
5
+
6
+ from __future__ import annotations
7
+
8
+ import functools
9
+ from typing import TYPE_CHECKING
10
+
11
+ import polars as pl
12
+ import polars.datatypes.convert
13
+ from polars.exceptions import InvalidOperationError
14
+
15
+ import pylibcudf as plc
16
+ from pylibcudf.strings.convert.convert_floats import from_floats, is_float, to_floats
17
+ from pylibcudf.strings.convert.convert_integers import (
18
+ from_integers,
19
+ is_integer,
20
+ to_integers,
21
+ )
22
+ from pylibcudf.traits import is_floating_point
23
+
24
+ from cudf_polars.containers import DataType
25
+ from cudf_polars.utils import conversion
26
+ from cudf_polars.utils.dtypes import is_order_preserving_cast
27
+
28
+ if TYPE_CHECKING:
29
+ from typing_extensions import Self
30
+
31
+ from cudf_polars.typing import (
32
+ ColumnHeader,
33
+ ColumnOptions,
34
+ DeserializedColumnOptions,
35
+ Slice,
36
+ )
37
+
38
+ __all__: list[str] = ["Column"]
39
+
40
+
41
def _dtype_short_repr_to_dtype(dtype_str: str) -> pl.DataType:
    """
    Build a Polars dtype instance from its short string representation.

    Parameters
    ----------
    dtype_str
        Short repr of a Polars dtype. A repr starting with ``"list["`` is
        unwrapped recursively and rebuilt as a ``pl.List``.

    Returns
    -------
    An instantiated Polars dtype.

    Raises
    ------
    ValueError
        If Polars returns ``None`` when asked to parse the string.
    """
    # Limitations of polars' dtype_short_repr_to_dtype are described in
    # py-polars/polars/datatypes/convert.py#L299; list reprs are therefore
    # peeled here one level at a time before delegating to Polars.
    list_prefix = "list["
    if dtype_str.startswith(list_prefix):
        inner_repr = dtype_str.removeprefix(list_prefix).removesuffix("]")
        return pl.List(_dtype_short_repr_to_dtype(inner_repr))

    parsed = polars.datatypes.convert.dtype_short_repr_to_dtype(dtype_str)
    if parsed is None:
        raise ValueError(f"{dtype_str} was not able to be parsed by Polars.")

    # The parser may return the dtype class itself; callers want an instance.
    return parsed() if isinstance(parsed, polars.datatypes.DataTypeClass) else parsed
55
+
56
+
57
class Column:
    """
    An immutable column with sortedness metadata.

    The wrapped pylibcudf column is never modified by this class; the
    sortedness flags and the name are plain attributes that some methods
    (``set_sorted``, ``copy_metadata``, ``check_sorted``) update in place.
    """

    obj: plc.Column
    is_sorted: plc.types.Sorted
    order: plc.types.Order
    null_order: plc.types.NullOrder
    is_scalar: bool
    # Optional name, only ever set by evaluation of NamedExpr nodes
    # The internal evaluation should not care about the name.
    name: str | None
    dtype: DataType

    def __init__(
        self,
        column: plc.Column,
        dtype: DataType,
        *,
        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
        order: plc.types.Order = plc.types.Order.ASCENDING,
        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
        name: str | None = None,
    ):
        """
        Wrap a pylibcudf column together with its metadata.

        Parameters
        ----------
        column
            The pylibcudf column to wrap.
        dtype
            The datatype of the column.
        is_sorted
            Whether the column is known to be sorted. Columns of length
            at most one are always recorded as sorted (see ``set_sorted``).
        order
            The sort order, meaningful if the column is sorted.
        null_order
            Where nulls sort, meaningful if the column is sorted.
        name
            Optional name of the column.
        """
        self.obj = column
        # A length-1 column can stand in for a scalar (see ``obj_scalar``).
        self.is_scalar = self.size == 1
        self.name = name
        self.dtype = dtype
        self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)

    @classmethod
    def deserialize(
        cls, header: ColumnHeader, frames: tuple[memoryview, plc.gpumemoryview]
    ) -> Self:
        """
        Create a Column from a serialized representation returned by `.serialize()`.

        Parameters
        ----------
        header
            The (unpickled) metadata required to reconstruct the object.
        frames
            Two-tuple of frames (a memoryview and a gpumemoryview).

        Returns
        -------
        Column
            The deserialized Column.
        """
        packed_metadata, packed_gpu_data = frames
        # ``serialize`` packs a single-column table, so exactly one column
        # is expected back; the tuple unpacking enforces that.
        (plc_column,) = plc.contiguous_split.unpack_from_memoryviews(
            packed_metadata, packed_gpu_data
        ).columns()
        return cls(plc_column, **cls.deserialize_ctor_kwargs(header["column_kwargs"]))

    @staticmethod
    def deserialize_ctor_kwargs(
        column_kwargs: ColumnOptions,
    ) -> DeserializedColumnOptions:
        """
        Deserialize the constructor kwargs for a Column.

        Parameters
        ----------
        column_kwargs
            Mapping produced by ``serialize_ctor_kwargs``; its ``"dtype"``
            entry is a Polars dtype short repr string.

        Returns
        -------
        Keyword arguments suitable for the ``Column`` constructor, with
        ``"dtype"`` converted back into a ``DataType``.
        """
        dtype = DataType(  # pragma: no cover
            _dtype_short_repr_to_dtype(column_kwargs["dtype"])
        )
        return {
            "is_sorted": column_kwargs["is_sorted"],
            "order": column_kwargs["order"],
            "null_order": column_kwargs["null_order"],
            "name": column_kwargs["name"],
            "dtype": dtype,
        }

    def serialize(
        self,
    ) -> tuple[ColumnHeader, tuple[memoryview, plc.gpumemoryview]]:
        """
        Serialize the Column into header and frames.

        Follows the Dask serialization scheme with a picklable header (dict) and
        a tuple of frames (in this case a contiguous host and device buffer).

        To enable dask support, dask serializers must be registered

        >>> from cudf_polars.experimental.dask_registers import register
        >>> register()

        Returns
        -------
        header
            A dict containing any picklable metadata required to reconstruct the object.
        frames
            Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
        """
        # Pack the column as a one-column table so that ``deserialize`` can
        # unpack it with ``unpack_from_memoryviews``.
        packed = plc.contiguous_split.pack(plc.Table([self.obj]))
        header: ColumnHeader = {
            "column_kwargs": self.serialize_ctor_kwargs(),
            "frame_count": 2,
        }
        return header, packed.release()

    def serialize_ctor_kwargs(self) -> ColumnOptions:
        """
        Serialize the constructor kwargs for self.

        Returns
        -------
        Mapping of the sortedness metadata, the name, and the dtype; the
        dtype is stored as its Polars string repr so that it can be parsed
        back by ``deserialize_ctor_kwargs``.
        """
        return {
            "is_sorted": self.is_sorted,
            "order": self.order,
            "null_order": self.null_order,
            "name": self.name,
            "dtype": pl.polars.dtype_str_repr(self.dtype.polars),
        }

    @functools.cached_property
    def obj_scalar(self) -> plc.Scalar:
        """
        A copy of the column object as a pylibcudf Scalar.

        Returns
        -------
        pylibcudf Scalar object.

        Raises
        ------
        ValueError
            If the column is not length-1.
        """
        if not self.is_scalar:
            raise ValueError(f"Cannot convert a column of length {self.size} to scalar")
        return plc.copying.get_element(self.obj, 0)

    def rename(self, name: str | None, /) -> Self:
        """
        Return a shallow copy with a new name.

        Parameters
        ----------
        name
            New name

        Returns
        -------
        Shallow copy of self with new name set.
        """
        new = self.copy()
        new.name = name
        return new

    def sorted_like(self, like: Column, /) -> Self:
        """
        Return a shallow copy with sortedness from like.

        Parameters
        ----------
        like
            The column to copy sortedness metadata from.

        Returns
        -------
        Shallow copy of self with metadata set.

        See Also
        --------
        set_sorted, copy_metadata
        """
        return type(self)(
            self.obj,
            name=self.name,
            dtype=self.dtype,
            is_sorted=like.is_sorted,
            order=like.order,
            null_order=like.null_order,
        )

    def check_sorted(
        self,
        *,
        order: plc.types.Order,
        null_order: plc.types.NullOrder,
    ) -> bool:
        """
        Check if the column is sorted.

        Parameters
        ----------
        order
            The requested sort order.
        null_order
            Where nulls sort to.

        Returns
        -------
        True if the column is sorted, false otherwise.

        Notes
        -----
        If the sortedness flag is not set, this launches a kernel to
        check sortedness. When that check succeeds the result is
        recorded in the column's sortedness metadata, so later calls
        with the same arguments do not need to launch the kernel again.
        """
        # Trivially sorted: at most one row, or every row is null.
        if self.size <= 1 or self.size == self.null_count:
            return True
        if self.is_sorted == plc.types.Sorted.YES:
            # Trust the recorded metadata; the null order only matters
            # when there actually are nulls.
            return self.order == order and (
                self.null_count == 0 or self.null_order == null_order
            )
        if plc.sorting.is_sorted(plc.Table([self.obj]), [order], [null_order]):
            # Record the verified sortedness on the real ``is_sorted``
            # flag (the attribute read above), together with its order.
            self.is_sorted = plc.types.Sorted.YES
            self.order = order
            self.null_order = null_order
            return True
        return False

    def astype(self, dtype: DataType) -> Column:
        """
        Cast the column to the requested dtype.

        Parameters
        ----------
        dtype
            Datatype to cast to.

        Returns
        -------
        Column of requested type.

        Raises
        ------
        RuntimeError
            If the cast is unsupported.
        InvalidOperationError
            If casting from strings and some value fails the numeric
            format check (see ``_handle_string_cast``).

        Notes
        -----
        This only produces a copy if the requested dtype doesn't match
        the current one.
        """
        plc_dtype = dtype.plc
        if self.obj.type() == plc_dtype:
            return self

        if (
            plc_dtype.id() == plc.TypeId.STRING
            or self.obj.type().id() == plc.TypeId.STRING
        ):
            # Casts to or from strings go through the dedicated string
            # conversion routines; sortedness is not carried over.
            return Column(self._handle_string_cast(plc_dtype), dtype=dtype)
        elif plc.traits.is_integral_not_bool(
            self.obj.type()
        ) and plc.traits.is_timestamp(plc_dtype):
            # Integer -> timestamp: widen to INT64, then rebuild a column
            # of the timestamp type over the same buffers.
            upcasted = plc.unary.cast(self.obj, plc.DataType(plc.TypeId.INT64))
            result = plc.column.Column(
                plc_dtype,
                upcasted.size(),
                upcasted.data(),
                upcasted.null_mask(),
                upcasted.null_count(),
                upcasted.offset(),
                upcasted.children(),
            )
            return Column(result, dtype=dtype).sorted_like(self)
        elif plc.traits.is_integral_not_bool(plc_dtype) and plc.traits.is_timestamp(
            self.obj.type()
        ):
            # Timestamp -> integer: rebuild an INT64 column over the same
            # buffers, then cast to the requested integer type.
            result = plc.column.Column(
                plc.DataType(plc.TypeId.INT64),
                self.obj.size(),
                self.obj.data(),
                self.obj.null_mask(),
                self.obj.null_count(),
                self.obj.offset(),
                self.obj.children(),
            )
            return Column(plc.unary.cast(result, plc_dtype), dtype=dtype).sorted_like(
                self
            )
        else:
            result = Column(plc.unary.cast(self.obj, plc_dtype), dtype=dtype)
            # Only keep the sortedness metadata when the cast is known
            # to preserve ordering.
            if is_order_preserving_cast(self.obj.type(), plc_dtype):
                return result.sorted_like(self)
            return result

    def _handle_string_cast(self, dtype: plc.DataType) -> plc.Column:
        """
        Cast to or from a string column.

        Parameters
        ----------
        dtype
            The target pylibcudf datatype.

        Returns
        -------
        The converted pylibcudf column.

        Raises
        ------
        InvalidOperationError
            If converting from strings and not every value passes the
            float (respectively integer) format check.
        """
        if dtype.id() == plc.TypeId.STRING:
            # Target is string: format the numeric values as text.
            if is_floating_point(self.obj.type()):
                return from_floats(self.obj)
            else:
                return from_integers(self.obj)
        else:
            # Target is not string (the caller only gets here when the
            # source is): validate every value before parsing it.
            if is_floating_point(dtype):
                floats = is_float(self.obj)
                if not plc.reduce.reduce(
                    floats,
                    plc.aggregation.all(),
                    plc.DataType(plc.TypeId.BOOL8),
                ).to_py():
                    raise InvalidOperationError("Conversion from `str` failed.")
                return to_floats(self.obj, dtype)
            else:
                integers = is_integer(self.obj)
                if not plc.reduce.reduce(
                    integers,
                    plc.aggregation.all(),
                    plc.DataType(plc.TypeId.BOOL8),
                ).to_py():
                    raise InvalidOperationError("Conversion from `str` failed.")
                return to_integers(self.obj, dtype)

    def copy_metadata(self, from_: pl.Series, /) -> Self:
        """
        Copy metadata from a host series onto self.

        The name is always copied. Sortedness is copied only when the
        series has more than one element and one of its sorted flags is
        set.

        Parameters
        ----------
        from_
            Polars series to copy metadata from

        Returns
        -------
        Self with metadata set.

        See Also
        --------
        set_sorted, sorted_like
        """
        self.name = from_.name
        if len(from_) <= 1:
            return self
        ascending = from_.flags["SORTED_ASC"]
        descending = from_.flags["SORTED_DESC"]
        if ascending or descending:
            # Infer where the nulls sort by inspecting the two ends of
            # the (already sorted) host series.
            has_null_first = from_.item(0) is None
            has_null_last = from_.item(-1) is None
            order = (
                plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING
            )
            null_order = plc.types.NullOrder.BEFORE
            if (descending and has_null_first) or (ascending and has_null_last):
                null_order = plc.types.NullOrder.AFTER
            return self.set_sorted(
                is_sorted=plc.types.Sorted.YES,
                order=order,
                null_order=null_order,
            )
        return self

    def set_sorted(
        self,
        *,
        is_sorted: plc.types.Sorted,
        order: plc.types.Order,
        null_order: plc.types.NullOrder,
    ) -> Self:
        """
        Modify sortedness metadata in place.

        Parameters
        ----------
        is_sorted
            Is the column sorted
        order
            The order if sorted
        null_order
            Where nulls sort, if sorted

        Returns
        -------
        Self with metadata set.
        """
        # Columns with at most one row are sorted regardless of the
        # flag that was requested.
        if self.size <= 1:
            is_sorted = plc.types.Sorted.YES
        self.is_sorted = is_sorted
        self.order = order
        self.null_order = null_order
        return self

    def copy(self) -> Self:
        """
        A shallow copy of the column.

        Returns
        -------
        New column sharing data with self.
        """
        return type(self)(
            self.obj,
            is_sorted=self.is_sorted,
            order=self.order,
            null_order=self.null_order,
            name=self.name,
            dtype=self.dtype,
        )

    def mask_nans(self) -> Self:
        """
        Return a shallow copy of self with nans masked out.

        Notes
        -----
        For floating point columns the result is built from the data and
        dtype only. Sortedness metadata is carried over only when the
        null count is unchanged by the masking. Non floating point
        columns are returned as a plain ``copy()``.
        """
        if plc.traits.is_floating_point(self.obj.type()):
            old_count = self.null_count
            mask, new_count = plc.transform.nans_to_nulls(self.obj)
            result = type(self)(self.obj.with_mask(mask, new_count), self.dtype)
            if old_count == new_count:
                # Null count unchanged, so existing sortedness still applies.
                return result.sorted_like(self)
            return result
        return self.copy()

    @functools.cached_property
    def nan_count(self) -> int:
        """Return the number of NaN values in the column."""
        # Only non-empty floating point columns are inspected; every
        # other column reports zero without launching a reduction.
        if self.size > 0 and plc.traits.is_floating_point(self.obj.type()):
            return plc.reduce.reduce(
                plc.unary.is_nan(self.obj),
                plc.aggregation.sum(),
                plc.types.SIZE_TYPE,
            ).to_py()
        return 0

    @property
    def size(self) -> int:
        """Return the size of the column."""
        return self.obj.size()

    @property
    def null_count(self) -> int:
        """Return the number of Null values in the column."""
        return self.obj.null_count()

    def slice(self, zlice: Slice | None) -> Self:
        """
        Slice a column.

        Parameters
        ----------
        zlice
            optional, tuple of start and length, negative values of start
            treated as for python indexing. If not provided, returns self.

        Returns
        -------
        New column (if zlice is not None) otherwise self (if it is)
        """
        if zlice is None:
            return self
        (table,) = plc.copying.slice(
            plc.Table([self.obj]),
            conversion.from_polars_slice(zlice, num_rows=self.size),
        )
        (column,) = table.columns()
        # The slice keeps the name, dtype and sortedness metadata of self.
        return type(self)(column, name=self.name, dtype=self.dtype).sorted_like(self)