cudf-polars-cu12 25.8.0__py3-none-any.whl → 25.10.0__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +1 -10
- cudf_polars/containers/column.py +29 -0
- cudf_polars/containers/dataframe.py +15 -6
- cudf_polars/containers/datatype.py +2 -0
- cudf_polars/dsl/expressions/base.py +4 -0
- cudf_polars/dsl/expressions/boolean.py +41 -20
- cudf_polars/dsl/expressions/datetime.py +1 -0
- cudf_polars/dsl/expressions/rolling.py +487 -7
- cudf_polars/dsl/expressions/string.py +275 -21
- cudf_polars/dsl/expressions/struct.py +3 -4
- cudf_polars/dsl/expressions/unary.py +148 -32
- cudf_polars/dsl/ir.py +244 -61
- cudf_polars/dsl/translate.py +68 -21
- cudf_polars/dsl/utils/aggregations.py +201 -21
- cudf_polars/dsl/utils/groupby.py +6 -1
- cudf_polars/dsl/utils/rolling.py +7 -1
- cudf_polars/dsl/utils/windows.py +7 -1
- cudf_polars/experimental/base.py +258 -25
- cudf_polars/experimental/benchmarks/pdsds.py +8 -4
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +2 -0
- cudf_polars/experimental/benchmarks/utils.py +140 -33
- cudf_polars/experimental/dispatch.py +58 -1
- cudf_polars/experimental/distinct.py +3 -0
- cudf_polars/experimental/explain.py +33 -3
- cudf_polars/experimental/expressions.py +45 -2
- cudf_polars/experimental/groupby.py +3 -0
- cudf_polars/experimental/io.py +80 -39
- cudf_polars/experimental/join.py +5 -3
- cudf_polars/experimental/parallel.py +39 -6
- cudf_polars/experimental/select.py +32 -5
- cudf_polars/experimental/shuffle.py +82 -30
- cudf_polars/experimental/sort.py +575 -11
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +65 -8
- cudf_polars/testing/asserts.py +23 -12
- cudf_polars/testing/io.py +52 -2
- cudf_polars/testing/plugin.py +14 -23
- cudf_polars/utils/config.py +154 -18
- cudf_polars/utils/dtypes.py +17 -4
- cudf_polars/utils/versions.py +3 -1
- {cudf_polars_cu12-25.8.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/METADATA +18 -7
- cudf_polars_cu12-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu12-25.8.0.dist-info/RECORD +0 -81
- {cudf_polars_cu12-25.8.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/WHEEL +0 -0
- {cudf_polars_cu12-25.8.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu12-25.8.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/top_level.txt +0 -0
cudf_polars/GIT_COMMIT
ADDED
@@ -0,0 +1 @@
+f4e35ca02118eada383e7417273c6cb1857ec66e
cudf_polars/VERSION
CHANGED
@@ -1 +1 @@
-25.08.00
+25.10.00
cudf_polars/callback.py
CHANGED
@@ -40,14 +40,6 @@ if TYPE_CHECKING:
 __all__: list[str] = ["execute_with_cudf"]
 
 
-_SUPPORTED_PREFETCHES = {
-    "column_view::get_data",
-    "mutable_column_view::get_data",
-    "gather",
-    "hash_join",
-}
-
-
 @cache
 def default_memory_resource(
     device: int,
@@ -80,8 +72,7 @@ def default_memory_resource(
         # Leaving a 20% headroom to avoid OOM errors.
        free_memory, _ = rmm.mr.available_device_memory()
        free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
-        for key in _SUPPORTED_PREFETCHES:
-            pylibcudf.experimental.enable_prefetching(key)
+        pylibcudf.prefetch.enable()
         mr = rmm.mr.PrefetchResourceAdaptor(
             rmm.mr.PoolMemoryResource(
                 rmm.mr.ManagedMemoryResource(),
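For context, this change replaces the per-function prefetch registrations with a single global `pylibcudf.prefetch.enable()` call. Below is a minimal sketch of the prefetching managed-memory setup this code path assembles; the `initial_pool_size` keyword and the `set_current_device_resource` call are illustrative additions, not lines from the diff.

    # Sketch only: prefetch + managed-memory resource stack, with the 20%-headroom
    # pool sizing shown above (rounded to a multiple of 256 bytes).
    import pylibcudf
    import rmm

    free_memory, _ = rmm.mr.available_device_memory()
    pool_size = int(round(float(free_memory) * 0.80 / 256) * 256)

    pylibcudf.prefetch.enable()  # replaces the old per-function enable_prefetching() calls
    mr = rmm.mr.PrefetchResourceAdaptor(
        rmm.mr.PoolMemoryResource(
            rmm.mr.ManagedMemoryResource(),
            initial_pool_size=pool_size,  # illustrative; not taken from the diff
        )
    )
    rmm.mr.set_current_device_resource(mr)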
cudf_polars/containers/column.py
CHANGED
@@ -293,6 +293,35 @@ class Column:
             or self.obj.type().id() == plc.TypeId.STRING
         ):
             return Column(self._handle_string_cast(plc_dtype), dtype=dtype)
+        elif plc.traits.is_integral_not_bool(
+            self.obj.type()
+        ) and plc.traits.is_timestamp(plc_dtype):
+            upcasted = plc.unary.cast(self.obj, plc.DataType(plc.TypeId.INT64))
+            result = plc.column.Column(
+                plc_dtype,
+                upcasted.size(),
+                upcasted.data(),
+                upcasted.null_mask(),
+                upcasted.null_count(),
+                upcasted.offset(),
+                upcasted.children(),
+            )
+            return Column(result, dtype=dtype).sorted_like(self)
+        elif plc.traits.is_integral_not_bool(plc_dtype) and plc.traits.is_timestamp(
+            self.obj.type()
+        ):
+            result = plc.column.Column(
+                plc.DataType(plc.TypeId.INT64),
+                self.obj.size(),
+                self.obj.data(),
+                self.obj.null_mask(),
+                self.obj.null_count(),
+                self.obj.offset(),
+                self.obj.children(),
+            )
+            return Column(plc.unary.cast(result, plc_dtype), dtype=dtype).sorted_like(
+                self
+            )
         else:
             result = Column(plc.unary.cast(self.obj, plc_dtype), dtype=dtype)
             if is_order_preserving_cast(self.obj.type(), plc_dtype):
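The two new branches reinterpret the INT64 buffer directly when casting between integers and timestamps, rather than routing everything through `plc.unary.cast`. A hedged, user-facing illustration of the behaviour this enables follows; the column names are invented and `engine="gpu"` assumes a working cudf-polars install.

    import polars as pl

    q = (
        pl.LazyFrame({"epoch_us": [0, 1_000_000, None]})
        .with_columns(ts=pl.col("epoch_us").cast(pl.Datetime("us")))  # int -> timestamp
        .with_columns(back=pl.col("ts").cast(pl.Int64))               # timestamp -> int
    )
    print(q.collect(engine="gpu"))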
cudf_polars/containers/dataframe.py
CHANGED
@@ -29,24 +29,33 @@ __all__: list[str] = ["DataFrame"]
 def _create_polars_column_metadata(
     name: str, dtype: PolarsDataType
 ) -> plc.interop.ColumnMetadata:
-    """Create ColumnMetadata preserving
+    """Create ColumnMetadata preserving dtype attributes not supported by libcudf."""
+    children_meta = []
+    timezone = ""
+    precision: int | None = None
+
     if isinstance(dtype, pl.Struct):
         children_meta = [
             _create_polars_column_metadata(field.name, field.dtype)
             for field in dtype.fields
         ]
-
-
-
+    elif isinstance(dtype, pl.Datetime):
+        timezone = dtype.time_zone or timezone
+    elif isinstance(dtype, pl.Decimal):
+        precision = dtype.precision
+
     return plc.interop.ColumnMetadata(
-        name=name,
+        name=name,
+        timezone=timezone,
+        precision=precision,
+        children_meta=children_meta,
     )
 
 
 # This is also defined in pylibcudf.interop
 class _ObjectWithArrowMetadata:
     def __init__(
-        self, obj: plc.Table, metadata: list[plc.interop.ColumnMetadata]
+        self, obj: plc.Table | plc.Column, metadata: list[plc.interop.ColumnMetadata]
     ) -> None:
         self.obj = obj
         self.metadata = metadata
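The updated helper now threads a timezone and a decimal precision into libcudf's Arrow interop metadata. A hypothetical illustration of the metadata it would produce for a timezone-aware Datetime column is shown below; the column name and dtype are made up, and the keyword arguments simply follow the new call above.

    import polars as pl
    import pylibcudf as plc

    dtype = pl.Datetime("us", time_zone="UTC")
    meta = plc.interop.ColumnMetadata(
        name="ts",
        timezone=dtype.time_zone or "",  # "UTC"
        precision=None,                  # only set for pl.Decimal
        children_meta=[],                # populated recursively for pl.Struct
    )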
cudf_polars/containers/datatype.py
CHANGED
@@ -81,6 +81,8 @@ def _from_polars(dtype: pl.DataType) -> plc.DataType:
         assert_never(dtype.time_unit)
     elif isinstance(dtype, pl.String):
         return plc.DataType(plc.TypeId.STRING)
+    elif isinstance(dtype, pl.Decimal):
+        return plc.DataType(plc.TypeId.DECIMAL128, scale=-dtype.scale)
     elif isinstance(dtype, pl.Null):
         # TODO: Hopefully
         return plc.DataType(plc.TypeId.EMPTY)
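The new `pl.Decimal` branch negates the scale because libcudf's fixed-point types store a power-of-ten exponent (value = unscaled mantissa × 10^scale, negative for fractional digits), while polars' `Decimal.scale` counts digits after the decimal point. A small illustration of the convention (pure Python, arbitrary numbers):

    import polars as pl

    dtype = pl.Decimal(precision=10, scale=2)
    libcudf_scale = -dtype.scale           # -2, as passed to plc.DataType above
    unscaled = 12345                       # integer mantissa stored by libcudf
    value = unscaled / 10 ** dtype.scale   # 123.45, i.e. unscaled * 10**libcudf_scale
    print(libcudf_scale, value)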
cudf_polars/dsl/expressions/base.py
CHANGED
@@ -31,6 +31,10 @@ class ExecutionContext(IntEnum):
     FRAME = enum.auto()
     GROUPBY = enum.auto()
     ROLLING = enum.auto()
+    # Follows GROUPBY semantics but useful
+    # to differentiate from GROUPBY so we can
+    # implement agg/per-row ops independently
+    WINDOW = enum.auto()
 
 
 class Expr(Node["Expr"]):
cudf_polars/dsl/expressions/boolean.py
CHANGED
@@ -38,6 +38,7 @@ class BooleanFunction(Expr):
         Any = auto()
         AnyHorizontal = auto()
         IsBetween = auto()
+        IsClose = auto()
         IsDuplicated = auto()
         IsFinite = auto()
         IsFirstDistinct = auto()
@@ -85,6 +86,12 @@ class BooleanFunction(Expr):
             BooleanFunction.Name.IsLastDistinct,
             BooleanFunction.Name.IsUnique,
         )
+        if self.name in {
+            BooleanFunction.Name.IsClose,
+        }:
+            raise NotImplementedError(
+                f"Boolean function {self.name}"
+            )  # pragma: no cover
 
     @staticmethod
     def _distinct(
@@ -146,13 +153,18 @@ class BooleanFunction(Expr):
         ):
             # Avoid evaluating the child if the dtype tells us it's unnecessary.
             (child,) = self.children
+            needles = child.evaluate(df, context=context)
+            is_float = needles.obj.type().id() in (
+                plc.TypeId.FLOAT32,
+                plc.TypeId.FLOAT64,
+            )
             is_finite = self.name is BooleanFunction.Name.IsFinite
-            if
-
-
-                plc.Column.from_scalar(value, df.num_rows), dtype=self.dtype
+            if not is_float:
+                base = plc.Column.from_scalar(
+                    plc.Scalar.from_py(py_val=is_finite), needles.size
                 )
-
+                out = base.with_mask(needles.obj.null_mask(), needles.null_count)
+                return Column(out, dtype=self.dtype)
             to_search = [-float("inf"), float("inf")]
             if is_finite:
                 # NaN is neither finite not infinite
@@ -164,7 +176,10 @@ class BooleanFunction(Expr):
             result = plc.search.contains(haystack, needles.obj)
             if is_finite:
                 result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
-            return Column(
+            return Column(
+                result.with_mask(needles.obj.null_mask(), needles.null_count),
+                dtype=self.dtype,
+            )
         columns = [child.evaluate(df, context=context) for child in self.children]
         # Kleene logic for Any (OR) and All (AND) if ignore_nulls is
         # False
@@ -199,22 +214,28 @@ class BooleanFunction(Expr):
         elif self.name is BooleanFunction.Name.IsNotNull:
             (column,) = columns
             return Column(plc.unary.is_valid(column.obj), dtype=self.dtype)
-        elif self.name is BooleanFunction.Name.IsNan:
+        elif self.name in (BooleanFunction.Name.IsNan, BooleanFunction.Name.IsNotNan):
             (column,) = columns
-            return Column(
-                plc.unary.is_nan(column.obj).with_mask(
-                    column.obj.null_mask(), column.null_count
-                ),
-                dtype=self.dtype,
-            )
-        elif self.name is BooleanFunction.Name.IsNotNan:
-            (column,) = columns
-            return Column(
-                plc.unary.is_not_nan(column.obj).with_mask(
-                    column.obj.null_mask(), column.null_count
-                ),
-                dtype=self.dtype,
+            is_float = column.obj.type().id() in (
+                plc.TypeId.FLOAT32,
+                plc.TypeId.FLOAT64,
             )
+            if is_float:
+                op = (
+                    plc.unary.is_nan
+                    if self.name is BooleanFunction.Name.IsNan
+                    else plc.unary.is_not_nan
+                )
+                base = op(column.obj)
+            else:
+                base = plc.Column.from_scalar(
+                    plc.Scalar.from_py(
+                        py_val=self.name is not BooleanFunction.Name.IsNan
+                    ),
+                    column.size,
+                )
+            out = base.with_mask(column.obj.null_mask(), column.null_count)
+            return Column(out, dtype=self.dtype)
         elif self.name is BooleanFunction.Name.IsFirstDistinct:
             (column,) = columns
             return self._distinct(