cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -1
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +28 -7
- cudf_polars/containers/column.py +51 -26
- cudf_polars/dsl/expressions/binaryop.py +1 -1
- cudf_polars/dsl/expressions/boolean.py +1 -1
- cudf_polars/dsl/expressions/selection.py +1 -1
- cudf_polars/dsl/expressions/string.py +29 -20
- cudf_polars/dsl/expressions/ternary.py +25 -1
- cudf_polars/dsl/expressions/unary.py +11 -8
- cudf_polars/dsl/ir.py +351 -281
- cudf_polars/dsl/translate.py +18 -15
- cudf_polars/dsl/utils/aggregations.py +10 -5
- cudf_polars/experimental/base.py +10 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1 -1
- cudf_polars/experimental/benchmarks/utils.py +83 -2
- cudf_polars/experimental/distinct.py +2 -0
- cudf_polars/experimental/explain.py +1 -1
- cudf_polars/experimental/expressions.py +8 -5
- cudf_polars/experimental/groupby.py +2 -0
- cudf_polars/experimental/io.py +64 -42
- cudf_polars/experimental/join.py +15 -2
- cudf_polars/experimental/parallel.py +10 -7
- cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
- cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
- cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
- cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
- cudf_polars/experimental/rapidsmpf/core.py +194 -67
- cudf_polars/experimental/rapidsmpf/dask.py +172 -0
- cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
- cudf_polars/experimental/rapidsmpf/io.py +162 -70
- cudf_polars/experimental/rapidsmpf/join.py +162 -77
- cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
- cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
- cudf_polars/experimental/rapidsmpf/union.py +24 -5
- cudf_polars/experimental/rapidsmpf/utils.py +228 -16
- cudf_polars/experimental/shuffle.py +18 -4
- cudf_polars/experimental/sort.py +13 -6
- cudf_polars/experimental/spilling.py +1 -1
- cudf_polars/testing/plugin.py +6 -3
- cudf_polars/utils/config.py +67 -0
- cudf_polars/utils/versions.py +3 -3
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/dsl/ir.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024-
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
"""
|
|
4
4
|
DSL nodes for the LogicalPlan of polars.
|
|
@@ -13,6 +13,7 @@ can be considered as functions:
|
|
|
13
13
|
|
|
14
14
|
from __future__ import annotations
|
|
15
15
|
|
|
16
|
+
import contextlib
|
|
16
17
|
import itertools
|
|
17
18
|
import json
|
|
18
19
|
import random
|
|
@@ -53,7 +54,7 @@ from cudf_polars.utils.cuda_stream import (
|
|
|
53
54
|
from cudf_polars.utils.versions import POLARS_VERSION_LT_131, POLARS_VERSION_LT_134
|
|
54
55
|
|
|
55
56
|
if TYPE_CHECKING:
|
|
56
|
-
from collections.abc import Callable, Hashable, Iterable, Sequence
|
|
57
|
+
from collections.abc import Callable, Generator, Hashable, Iterable, Sequence
|
|
57
58
|
from typing import Literal
|
|
58
59
|
|
|
59
60
|
from typing_extensions import Self
|
|
@@ -125,6 +126,50 @@ class IRExecutionContext:
|
|
|
125
126
|
f"Invalid CUDA stream policy: {config_options.cuda_stream_policy}"
|
|
126
127
|
)
|
|
127
128
|
|
|
129
|
+
@contextlib.contextmanager
|
|
130
|
+
def stream_ordered_after(self, *dfs: DataFrame) -> Generator[Stream, None, None]:
|
|
131
|
+
"""
|
|
132
|
+
Get a joined CUDA stream with safe stream ordering for deallocation of inputs.
|
|
133
|
+
|
|
134
|
+
Parameters
|
|
135
|
+
----------
|
|
136
|
+
dfs
|
|
137
|
+
The dataframes being provided to stream-ordered operations.
|
|
138
|
+
|
|
139
|
+
Yields
|
|
140
|
+
------
|
|
141
|
+
A CUDA stream that is downstream of the given dataframes.
|
|
142
|
+
|
|
143
|
+
Notes
|
|
144
|
+
-----
|
|
145
|
+
This context manager provides two useful guarantees when working with
|
|
146
|
+
objects holding references to stream-ordered objects:
|
|
147
|
+
|
|
148
|
+
1. The stream yielded upon entering the context manager is *downstream* of
|
|
149
|
+
all the input dataframes. This ensures that you can safely perform
|
|
150
|
+
stream-ordered operations on any input using the yielded stream.
|
|
151
|
+
2. The stream-ordered CUDA deallocation of the inputs happens *after* the
|
|
152
|
+
context manager exits. This ensures that all stream-ordered operations
|
|
153
|
+
submitted inside the context manager can complete before the memory
|
|
154
|
+
referenced by the inputs is deallocated.
|
|
155
|
+
|
|
156
|
+
Note that this does (deliberately) disconnect the dropping of the Python
|
|
157
|
+
object (by its refcount dropping to 0) from the actual stream-ordered
|
|
158
|
+
deallocation of the CUDA memory. This is precisely what we need to ensure
|
|
159
|
+
that the inputs are valid long enough for the stream-ordered operations to
|
|
160
|
+
complete.
|
|
161
|
+
"""
|
|
162
|
+
result_stream = get_joined_cuda_stream(
|
|
163
|
+
self.get_cuda_stream, upstreams=[df.stream for df in dfs]
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
yield result_stream
|
|
167
|
+
|
|
168
|
+
# ensure that the inputs are downstream of result_stream (so that deallocation happens after the result is ready)
|
|
169
|
+
join_cuda_streams(
|
|
170
|
+
downstreams=[df.stream for df in dfs], upstreams=[result_stream]
|
|
171
|
+
)
|
|
172
|
+
|
|
128
173
|
|
|
129
174
|
_BINOPS = {
|
|
130
175
|
plc.binaryop.BinaryOperator.EQUAL,
|
|
@@ -320,7 +365,11 @@ def _cast_literal_to_decimal(
|
|
|
320
365
|
name = side.name
|
|
321
366
|
if (type_ := phys_type_map[name]).id() in _DECIMAL_IDS:
|
|
322
367
|
scale = abs(type_.scale())
|
|
323
|
-
return expr.Cast(
|
|
368
|
+
return expr.Cast(
|
|
369
|
+
side.dtype,
|
|
370
|
+
True, # noqa: FBT003
|
|
371
|
+
expr.Cast(DataType(pl.Decimal(38, scale)), True, lit), # noqa: FBT003
|
|
372
|
+
)
|
|
324
373
|
return lit
|
|
325
374
|
|
|
326
375
|
|
|
@@ -1291,6 +1340,42 @@ class DataFrameScan(IR):
|
|
|
1291
1340
|
self.children = ()
|
|
1292
1341
|
self._id_for_hash = random.randint(0, 2**64 - 1)
|
|
1293
1342
|
|
|
1343
|
+
@staticmethod
|
|
1344
|
+
def _reconstruct(
|
|
1345
|
+
schema: Schema,
|
|
1346
|
+
pl_df: pl.DataFrame,
|
|
1347
|
+
projection: Sequence[str] | None,
|
|
1348
|
+
id_for_hash: int,
|
|
1349
|
+
) -> DataFrameScan: # pragma: no cover
|
|
1350
|
+
"""
|
|
1351
|
+
Reconstruct a DataFrameScan from pickled data.
|
|
1352
|
+
|
|
1353
|
+
Parameters
|
|
1354
|
+
----------
|
|
1355
|
+
schema: Schema
|
|
1356
|
+
The schema of the DataFrameScan.
|
|
1357
|
+
pl_df: pl.DataFrame
|
|
1358
|
+
The underlying polars DataFrame.
|
|
1359
|
+
projection: Sequence[str] | None
|
|
1360
|
+
The projection of the DataFrameScan.
|
|
1361
|
+
id_for_hash: int
|
|
1362
|
+
The id for hash of the DataFrameScan.
|
|
1363
|
+
|
|
1364
|
+
Returns
|
|
1365
|
+
-------
|
|
1366
|
+
The reconstructed DataFrameScan.
|
|
1367
|
+
"""
|
|
1368
|
+
node = DataFrameScan(schema, pl_df._df, projection)
|
|
1369
|
+
node._id_for_hash = id_for_hash
|
|
1370
|
+
return node
|
|
1371
|
+
|
|
1372
|
+
def __reduce__(self) -> tuple[Any, ...]: # pragma: no cover
|
|
1373
|
+
"""Pickle a DataFrameScan object."""
|
|
1374
|
+
return (
|
|
1375
|
+
self._reconstruct,
|
|
1376
|
+
(*self._non_child_args, self._id_for_hash),
|
|
1377
|
+
)
|
|
1378
|
+
|
|
1294
1379
|
def get_hashable(self) -> Hashable:
|
|
1295
1380
|
"""
|
|
1296
1381
|
Hashable representation of the node.
|
|
@@ -1307,6 +1392,17 @@ class DataFrameScan(IR):
|
|
|
1307
1392
|
self.projection,
|
|
1308
1393
|
)
|
|
1309
1394
|
|
|
1395
|
+
def is_equal(self, other: Self) -> bool:
|
|
1396
|
+
"""Equality of DataFrameScan nodes."""
|
|
1397
|
+
return self is other or (
|
|
1398
|
+
self._id_for_hash == other._id_for_hash
|
|
1399
|
+
and self.schema == other.schema
|
|
1400
|
+
and self.projection == other.projection
|
|
1401
|
+
and pl.DataFrame._from_pydf(self.df).equals(
|
|
1402
|
+
pl.DataFrame._from_pydf(other.df)
|
|
1403
|
+
)
|
|
1404
|
+
)
|
|
1405
|
+
|
|
1310
1406
|
@classmethod
|
|
1311
1407
|
@log_do_evaluate
|
|
1312
1408
|
@nvtx_annotate_cudf_polars(message="DataFrameScan")
|
|
@@ -1761,7 +1857,12 @@ class GroupBy(IR):
|
|
|
1761
1857
|
col = child.evaluate(df, context=ExecutionContext.GROUPBY).obj
|
|
1762
1858
|
else:
|
|
1763
1859
|
# Anything else, we pre-evaluate
|
|
1764
|
-
|
|
1860
|
+
column = value.evaluate(df, context=ExecutionContext.GROUPBY)
|
|
1861
|
+
if column.size != keys[0].size:
|
|
1862
|
+
column = broadcast(
|
|
1863
|
+
column, target_length=keys[0].size, stream=df.stream
|
|
1864
|
+
)[0]
|
|
1865
|
+
col = column.obj
|
|
1765
1866
|
requests.append(plc.groupby.GroupByRequest(col, [value.agg_request]))
|
|
1766
1867
|
names.append(name)
|
|
1767
1868
|
group_keys, raw_tables = grouper.aggregate(requests, stream=df.stream)
|
|
@@ -2030,54 +2131,48 @@ class ConditionalJoin(IR):
|
|
|
2030
2131
|
context: IRExecutionContext,
|
|
2031
2132
|
) -> DataFrame:
|
|
2032
2133
|
"""Evaluate and return a dataframe."""
|
|
2033
|
-
|
|
2034
|
-
|
|
2035
|
-
|
|
2036
|
-
|
|
2037
|
-
|
|
2038
|
-
),
|
|
2039
|
-
)
|
|
2040
|
-
left_casts, right_casts = _collect_decimal_binop_casts(
|
|
2041
|
-
predicate_wrapper.predicate
|
|
2042
|
-
)
|
|
2043
|
-
_, _, zlice, suffix, _, _ = options
|
|
2044
|
-
|
|
2045
|
-
lg, rg = plc.join.conditional_inner_join(
|
|
2046
|
-
_apply_casts(left, left_casts).table,
|
|
2047
|
-
_apply_casts(right, right_casts).table,
|
|
2048
|
-
predicate_wrapper.ast,
|
|
2049
|
-
stream=stream,
|
|
2050
|
-
)
|
|
2051
|
-
left_result = DataFrame.from_table(
|
|
2052
|
-
plc.copying.gather(
|
|
2053
|
-
left.table, lg, plc.copying.OutOfBoundsPolicy.DONT_CHECK, stream=stream
|
|
2054
|
-
),
|
|
2055
|
-
left.column_names,
|
|
2056
|
-
left.dtypes,
|
|
2057
|
-
stream=stream,
|
|
2058
|
-
)
|
|
2059
|
-
right_result = DataFrame.from_table(
|
|
2060
|
-
plc.copying.gather(
|
|
2061
|
-
right.table, rg, plc.copying.OutOfBoundsPolicy.DONT_CHECK, stream=stream
|
|
2062
|
-
),
|
|
2063
|
-
right.column_names,
|
|
2064
|
-
right.dtypes,
|
|
2065
|
-
stream=stream,
|
|
2066
|
-
)
|
|
2067
|
-
right_result = right_result.rename_columns(
|
|
2068
|
-
{
|
|
2069
|
-
name: f"{name}{suffix}"
|
|
2070
|
-
for name in right.column_names
|
|
2071
|
-
if name in left.column_names_set
|
|
2072
|
-
}
|
|
2073
|
-
)
|
|
2074
|
-
result = left_result.with_columns(right_result.columns, stream=stream)
|
|
2134
|
+
with context.stream_ordered_after(left, right) as stream:
|
|
2135
|
+
left_casts, right_casts = _collect_decimal_binop_casts(
|
|
2136
|
+
predicate_wrapper.predicate
|
|
2137
|
+
)
|
|
2138
|
+
_, _, zlice, suffix, _, _ = options
|
|
2075
2139
|
|
|
2076
|
-
|
|
2077
|
-
|
|
2078
|
-
|
|
2079
|
-
|
|
2080
|
-
|
|
2140
|
+
lg, rg = plc.join.conditional_inner_join(
|
|
2141
|
+
_apply_casts(left, left_casts).table,
|
|
2142
|
+
_apply_casts(right, right_casts).table,
|
|
2143
|
+
predicate_wrapper.ast,
|
|
2144
|
+
stream=stream,
|
|
2145
|
+
)
|
|
2146
|
+
left_result = DataFrame.from_table(
|
|
2147
|
+
plc.copying.gather(
|
|
2148
|
+
left.table,
|
|
2149
|
+
lg,
|
|
2150
|
+
plc.copying.OutOfBoundsPolicy.DONT_CHECK,
|
|
2151
|
+
stream=stream,
|
|
2152
|
+
),
|
|
2153
|
+
left.column_names,
|
|
2154
|
+
left.dtypes,
|
|
2155
|
+
stream=stream,
|
|
2156
|
+
)
|
|
2157
|
+
right_result = DataFrame.from_table(
|
|
2158
|
+
plc.copying.gather(
|
|
2159
|
+
right.table,
|
|
2160
|
+
rg,
|
|
2161
|
+
plc.copying.OutOfBoundsPolicy.DONT_CHECK,
|
|
2162
|
+
stream=stream,
|
|
2163
|
+
),
|
|
2164
|
+
right.column_names,
|
|
2165
|
+
right.dtypes,
|
|
2166
|
+
stream=stream,
|
|
2167
|
+
)
|
|
2168
|
+
right_result = right_result.rename_columns(
|
|
2169
|
+
{
|
|
2170
|
+
name: f"{name}{suffix}"
|
|
2171
|
+
for name in right.column_names
|
|
2172
|
+
if name in left.column_names_set
|
|
2173
|
+
}
|
|
2174
|
+
)
|
|
2175
|
+
result = left_result.with_columns(right_result.columns, stream=stream)
|
|
2081
2176
|
|
|
2082
2177
|
return result.slice(zlice)
|
|
2083
2178
|
|
|
@@ -2328,162 +2423,162 @@ class Join(IR):
|
|
|
2328
2423
|
context: IRExecutionContext,
|
|
2329
2424
|
) -> DataFrame:
|
|
2330
2425
|
"""Evaluate and return a dataframe."""
|
|
2331
|
-
|
|
2332
|
-
|
|
2333
|
-
|
|
2334
|
-
|
|
2335
|
-
|
|
2336
|
-
|
|
2337
|
-
|
|
2338
|
-
|
|
2339
|
-
|
|
2340
|
-
|
|
2341
|
-
|
|
2342
|
-
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
stream=stream
|
|
2351
|
-
|
|
2352
|
-
|
|
2426
|
+
with context.stream_ordered_after(left, right) as stream:
|
|
2427
|
+
how, nulls_equal, zlice, suffix, coalesce, maintain_order = options
|
|
2428
|
+
if how == "Cross":
|
|
2429
|
+
# Separate implementation, since cross_join returns the
|
|
2430
|
+
# result, not the gather maps
|
|
2431
|
+
if right.num_rows == 0:
|
|
2432
|
+
left_cols = Join._build_columns(
|
|
2433
|
+
[], left.columns, empty=True, stream=stream
|
|
2434
|
+
)
|
|
2435
|
+
right_cols = Join._build_columns(
|
|
2436
|
+
[],
|
|
2437
|
+
right.columns,
|
|
2438
|
+
left=False,
|
|
2439
|
+
empty=True,
|
|
2440
|
+
rename=lambda name: name
|
|
2441
|
+
if name not in left.column_names_set
|
|
2442
|
+
else f"{name}{suffix}",
|
|
2443
|
+
stream=stream,
|
|
2444
|
+
)
|
|
2445
|
+
result = DataFrame([*left_cols, *right_cols], stream=stream)
|
|
2446
|
+
else:
|
|
2447
|
+
columns = plc.join.cross_join(
|
|
2448
|
+
left.table, right.table, stream=stream
|
|
2449
|
+
).columns()
|
|
2450
|
+
left_cols = Join._build_columns(
|
|
2451
|
+
columns[: left.num_columns], left.columns, stream=stream
|
|
2452
|
+
)
|
|
2453
|
+
right_cols = Join._build_columns(
|
|
2454
|
+
columns[left.num_columns :],
|
|
2455
|
+
right.columns,
|
|
2456
|
+
rename=lambda name: name
|
|
2457
|
+
if name not in left.column_names_set
|
|
2458
|
+
else f"{name}{suffix}",
|
|
2459
|
+
left=False,
|
|
2460
|
+
stream=stream,
|
|
2461
|
+
)
|
|
2462
|
+
result = DataFrame([*left_cols, *right_cols], stream=stream).slice(
|
|
2463
|
+
zlice
|
|
2464
|
+
)
|
|
2465
|
+
|
|
2353
2466
|
else:
|
|
2354
|
-
|
|
2355
|
-
|
|
2356
|
-
|
|
2357
|
-
|
|
2358
|
-
|
|
2359
|
-
|
|
2360
|
-
right_cols = Join._build_columns(
|
|
2361
|
-
columns[left.num_columns :],
|
|
2362
|
-
right.columns,
|
|
2363
|
-
rename=lambda name: name
|
|
2364
|
-
if name not in left.column_names_set
|
|
2365
|
-
else f"{name}{suffix}",
|
|
2366
|
-
left=False,
|
|
2467
|
+
# how != "Cross"
|
|
2468
|
+
# TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184
|
|
2469
|
+
left_on = DataFrame(
|
|
2470
|
+
broadcast(
|
|
2471
|
+
*(e.evaluate(left) for e in left_on_exprs), stream=stream
|
|
2472
|
+
),
|
|
2367
2473
|
stream=stream,
|
|
2368
2474
|
)
|
|
2369
|
-
|
|
2370
|
-
|
|
2371
|
-
|
|
2372
|
-
|
|
2373
|
-
|
|
2374
|
-
# how != "Cross"
|
|
2375
|
-
# TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184
|
|
2376
|
-
left_on = DataFrame(
|
|
2377
|
-
broadcast(*(e.evaluate(left) for e in left_on_exprs), stream=stream),
|
|
2378
|
-
stream=stream,
|
|
2379
|
-
)
|
|
2380
|
-
right_on = DataFrame(
|
|
2381
|
-
broadcast(*(e.evaluate(right) for e in right_on_exprs), stream=stream),
|
|
2382
|
-
stream=stream,
|
|
2383
|
-
)
|
|
2384
|
-
null_equality = (
|
|
2385
|
-
plc.types.NullEquality.EQUAL
|
|
2386
|
-
if nulls_equal
|
|
2387
|
-
else plc.types.NullEquality.UNEQUAL
|
|
2388
|
-
)
|
|
2389
|
-
join_fn, left_policy, right_policy = cls._joiners(how)
|
|
2390
|
-
if right_policy is None:
|
|
2391
|
-
# Semi join
|
|
2392
|
-
lg = join_fn(left_on.table, right_on.table, null_equality, stream)
|
|
2393
|
-
table = plc.copying.gather(left.table, lg, left_policy, stream=stream)
|
|
2394
|
-
result = DataFrame.from_table(
|
|
2395
|
-
table, left.column_names, left.dtypes, stream=stream
|
|
2475
|
+
right_on = DataFrame(
|
|
2476
|
+
broadcast(
|
|
2477
|
+
*(e.evaluate(right) for e in right_on_exprs), stream=stream
|
|
2478
|
+
),
|
|
2479
|
+
stream=stream,
|
|
2396
2480
|
)
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
|
|
2401
|
-
left_on, right_on = right_on, left_on
|
|
2402
|
-
maintain_order = Join.SWAPPED_ORDER[maintain_order]
|
|
2403
|
-
|
|
2404
|
-
lg, rg = join_fn(
|
|
2405
|
-
left_on.table, right_on.table, null_equality, stream=stream
|
|
2481
|
+
null_equality = (
|
|
2482
|
+
plc.types.NullEquality.EQUAL
|
|
2483
|
+
if nulls_equal
|
|
2484
|
+
else plc.types.NullEquality.UNEQUAL
|
|
2406
2485
|
)
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
|
|
2411
|
-
|
|
2412
|
-
left.
|
|
2413
|
-
lg,
|
|
2414
|
-
left_policy,
|
|
2415
|
-
right.num_rows,
|
|
2416
|
-
rg,
|
|
2417
|
-
right_policy,
|
|
2418
|
-
left_primary=maintain_order.startswith("left"),
|
|
2419
|
-
stream=stream,
|
|
2486
|
+
join_fn, left_policy, right_policy = cls._joiners(how)
|
|
2487
|
+
if right_policy is None:
|
|
2488
|
+
# Semi join
|
|
2489
|
+
lg = join_fn(left_on.table, right_on.table, null_equality, stream)
|
|
2490
|
+
table = plc.copying.gather(
|
|
2491
|
+
left.table, lg, left_policy, stream=stream
|
|
2420
2492
|
)
|
|
2421
|
-
|
|
2422
|
-
|
|
2423
|
-
|
|
2424
|
-
|
|
2425
|
-
|
|
2426
|
-
|
|
2427
|
-
|
|
2428
|
-
|
|
2429
|
-
|
|
2430
|
-
|
|
2431
|
-
|
|
2432
|
-
|
|
2433
|
-
|
|
2493
|
+
result = DataFrame.from_table(
|
|
2494
|
+
table, left.column_names, left.dtypes, stream=stream
|
|
2495
|
+
)
|
|
2496
|
+
else:
|
|
2497
|
+
if how == "Right":
|
|
2498
|
+
# Right join is a left join with the tables swapped
|
|
2499
|
+
left, right = right, left
|
|
2500
|
+
left_on, right_on = right_on, left_on
|
|
2501
|
+
maintain_order = Join.SWAPPED_ORDER[maintain_order]
|
|
2502
|
+
|
|
2503
|
+
lg, rg = join_fn(
|
|
2504
|
+
left_on.table, right_on.table, null_equality, stream=stream
|
|
2505
|
+
)
|
|
2506
|
+
if (
|
|
2507
|
+
how in ("Inner", "Left", "Right", "Full")
|
|
2508
|
+
and maintain_order != "none"
|
|
2509
|
+
):
|
|
2510
|
+
lg, rg = cls._reorder_maps(
|
|
2511
|
+
left.num_rows,
|
|
2512
|
+
lg,
|
|
2513
|
+
left_policy,
|
|
2514
|
+
right.num_rows,
|
|
2515
|
+
rg,
|
|
2516
|
+
right_policy,
|
|
2517
|
+
left_primary=maintain_order.startswith("left"),
|
|
2518
|
+
stream=stream,
|
|
2434
2519
|
)
|
|
2435
|
-
|
|
2436
|
-
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
right.column_names,
|
|
2446
|
-
right.dtypes,
|
|
2447
|
-
stream=stream,
|
|
2448
|
-
)
|
|
2449
|
-
if coalesce and how == "Full":
|
|
2450
|
-
left = left.with_columns(
|
|
2451
|
-
(
|
|
2452
|
-
Column(
|
|
2453
|
-
plc.replace.replace_nulls(
|
|
2454
|
-
left_col.obj, right_col.obj, stream=stream
|
|
2455
|
-
),
|
|
2456
|
-
name=left_col.name,
|
|
2457
|
-
dtype=left_col.dtype,
|
|
2520
|
+
if coalesce:
|
|
2521
|
+
if how == "Full":
|
|
2522
|
+
# In this case, keys must be column references,
|
|
2523
|
+
# possibly with dtype casting. We should use them in
|
|
2524
|
+
# preference to the columns from the original tables.
|
|
2525
|
+
|
|
2526
|
+
# We need to specify `stream` here. We know that `{left,right}_on`
|
|
2527
|
+
# is valid on `stream`, which is ordered after `{left,right}.stream`.
|
|
2528
|
+
left = left.with_columns(
|
|
2529
|
+
left_on.columns, replace_only=True, stream=stream
|
|
2458
2530
|
)
|
|
2459
|
-
|
|
2460
|
-
|
|
2461
|
-
right.select_columns(right_on.column_names_set),
|
|
2462
|
-
strict=True,
|
|
2531
|
+
right = right.with_columns(
|
|
2532
|
+
right_on.columns, replace_only=True, stream=stream
|
|
2463
2533
|
)
|
|
2534
|
+
else:
|
|
2535
|
+
right = right.discard_columns(right_on.column_names_set)
|
|
2536
|
+
left = DataFrame.from_table(
|
|
2537
|
+
plc.copying.gather(left.table, lg, left_policy, stream=stream),
|
|
2538
|
+
left.column_names,
|
|
2539
|
+
left.dtypes,
|
|
2540
|
+
stream=stream,
|
|
2541
|
+
)
|
|
2542
|
+
right = DataFrame.from_table(
|
|
2543
|
+
plc.copying.gather(
|
|
2544
|
+
right.table, rg, right_policy, stream=stream
|
|
2464
2545
|
),
|
|
2465
|
-
|
|
2546
|
+
right.column_names,
|
|
2547
|
+
right.dtypes,
|
|
2466
2548
|
stream=stream,
|
|
2467
2549
|
)
|
|
2468
|
-
|
|
2469
|
-
|
|
2470
|
-
|
|
2471
|
-
|
|
2472
|
-
|
|
2473
|
-
|
|
2474
|
-
|
|
2475
|
-
|
|
2476
|
-
|
|
2477
|
-
|
|
2478
|
-
|
|
2479
|
-
|
|
2480
|
-
|
|
2481
|
-
|
|
2482
|
-
|
|
2483
|
-
|
|
2484
|
-
|
|
2485
|
-
|
|
2486
|
-
|
|
2550
|
+
if coalesce and how == "Full":
|
|
2551
|
+
left = left.with_columns(
|
|
2552
|
+
(
|
|
2553
|
+
Column(
|
|
2554
|
+
plc.replace.replace_nulls(
|
|
2555
|
+
left_col.obj, right_col.obj, stream=stream
|
|
2556
|
+
),
|
|
2557
|
+
name=left_col.name,
|
|
2558
|
+
dtype=left_col.dtype,
|
|
2559
|
+
)
|
|
2560
|
+
for left_col, right_col in zip(
|
|
2561
|
+
left.select_columns(left_on.column_names_set),
|
|
2562
|
+
right.select_columns(right_on.column_names_set),
|
|
2563
|
+
strict=True,
|
|
2564
|
+
)
|
|
2565
|
+
),
|
|
2566
|
+
replace_only=True,
|
|
2567
|
+
stream=stream,
|
|
2568
|
+
)
|
|
2569
|
+
right = right.discard_columns(right_on.column_names_set)
|
|
2570
|
+
if how == "Right":
|
|
2571
|
+
# Undo the swap for right join before gluing together.
|
|
2572
|
+
left, right = right, left
|
|
2573
|
+
right = right.rename_columns(
|
|
2574
|
+
{
|
|
2575
|
+
name: f"{name}{suffix}"
|
|
2576
|
+
for name in right.column_names
|
|
2577
|
+
if name in left.column_names_set
|
|
2578
|
+
}
|
|
2579
|
+
)
|
|
2580
|
+
result = left.with_columns(right.columns, stream=stream)
|
|
2581
|
+
result = result.slice(zlice)
|
|
2487
2582
|
|
|
2488
2583
|
return result
|
|
2489
2584
|
|
|
@@ -2826,33 +2921,25 @@ class MergeSorted(IR):
|
|
|
2826
2921
|
cls, key: str, *dfs: DataFrame, context: IRExecutionContext
|
|
2827
2922
|
) -> DataFrame:
|
|
2828
2923
|
"""Evaluate and return a dataframe."""
|
|
2829
|
-
|
|
2830
|
-
|
|
2831
|
-
|
|
2832
|
-
|
|
2833
|
-
|
|
2834
|
-
|
|
2835
|
-
|
|
2836
|
-
|
|
2837
|
-
|
|
2838
|
-
|
|
2839
|
-
|
|
2840
|
-
|
|
2841
|
-
|
|
2924
|
+
with context.stream_ordered_after(*dfs) as stream:
|
|
2925
|
+
left, right = dfs
|
|
2926
|
+
right = right.discard_columns(
|
|
2927
|
+
right.column_names_set - left.column_names_set
|
|
2928
|
+
)
|
|
2929
|
+
on_col_left = left.select_columns({key})[0]
|
|
2930
|
+
on_col_right = right.select_columns({key})[0]
|
|
2931
|
+
return DataFrame.from_table(
|
|
2932
|
+
plc.merge.merge(
|
|
2933
|
+
[right.table, left.table],
|
|
2934
|
+
[left.column_names.index(key), right.column_names.index(key)],
|
|
2935
|
+
[on_col_left.order, on_col_right.order],
|
|
2936
|
+
[on_col_left.null_order, on_col_right.null_order],
|
|
2937
|
+
stream=stream,
|
|
2938
|
+
),
|
|
2939
|
+
left.column_names,
|
|
2940
|
+
left.dtypes,
|
|
2842
2941
|
stream=stream,
|
|
2843
|
-
)
|
|
2844
|
-
left.column_names,
|
|
2845
|
-
left.dtypes,
|
|
2846
|
-
stream=stream,
|
|
2847
|
-
)
|
|
2848
|
-
|
|
2849
|
-
# Join the original streams back into the result stream to ensure that the
|
|
2850
|
-
# deallocations (on the original streams) happen after the result is ready
|
|
2851
|
-
join_cuda_streams(
|
|
2852
|
-
downstreams=[df.stream for df in dfs], upstreams=(result.stream,)
|
|
2853
|
-
)
|
|
2854
|
-
|
|
2855
|
-
return result
|
|
2942
|
+
)
|
|
2856
2943
|
|
|
2857
2944
|
|
|
2858
2945
|
class MapFunction(IR):
|
|
@@ -2928,6 +3015,8 @@ class MapFunction(IR):
|
|
|
2928
3015
|
)
|
|
2929
3016
|
elif self.name == "row_index":
|
|
2930
3017
|
col_name, offset = options
|
|
3018
|
+
if col_name in df.schema:
|
|
3019
|
+
raise NotImplementedError("Duplicate row index name")
|
|
2931
3020
|
self.options = (col_name, offset)
|
|
2932
3021
|
elif self.name == "fast_count":
|
|
2933
3022
|
# TODO: Remove this once all scan types support projections
|
|
@@ -3078,26 +3167,14 @@ class Union(IR):
|
|
|
3078
3167
|
cls, zlice: Zlice | None, *dfs: DataFrame, context: IRExecutionContext
|
|
3079
3168
|
) -> DataFrame:
|
|
3080
3169
|
"""Evaluate and return a dataframe."""
|
|
3081
|
-
|
|
3082
|
-
|
|
3083
|
-
|
|
3084
|
-
|
|
3085
|
-
|
|
3086
|
-
|
|
3087
|
-
|
|
3088
|
-
|
|
3089
|
-
dfs[0].dtypes,
|
|
3090
|
-
stream=stream,
|
|
3091
|
-
).slice(zlice)
|
|
3092
|
-
|
|
3093
|
-
# now join the original streams *back* to the new result stream
|
|
3094
|
-
# to ensure that the deallocations (on the original streams)
|
|
3095
|
-
# happen after the result is ready
|
|
3096
|
-
join_cuda_streams(
|
|
3097
|
-
downstreams=[df.stream for df in dfs], upstreams=(result.stream,)
|
|
3098
|
-
)
|
|
3099
|
-
|
|
3100
|
-
return result
|
|
3170
|
+
with context.stream_ordered_after(*dfs) as stream:
|
|
3171
|
+
# TODO: only evaluate what we need if we have a slice?
|
|
3172
|
+
return DataFrame.from_table(
|
|
3173
|
+
plc.concatenate.concatenate([df.table for df in dfs], stream=stream),
|
|
3174
|
+
dfs[0].column_names,
|
|
3175
|
+
dfs[0].dtypes,
|
|
3176
|
+
stream=stream,
|
|
3177
|
+
).slice(zlice)
|
|
3101
3178
|
|
|
3102
3179
|
|
|
3103
3180
|
class HConcat(IR):
|
|
@@ -3160,48 +3237,41 @@ class HConcat(IR):
|
|
|
3160
3237
|
context: IRExecutionContext,
|
|
3161
3238
|
) -> DataFrame:
|
|
3162
3239
|
"""Evaluate and return a dataframe."""
|
|
3163
|
-
|
|
3164
|
-
|
|
3165
|
-
|
|
3166
|
-
|
|
3167
|
-
|
|
3168
|
-
|
|
3169
|
-
|
|
3170
|
-
|
|
3171
|
-
|
|
3172
|
-
*itertools.chain.from_iterable(df.columns for df in dfs),
|
|
3240
|
+
with context.stream_ordered_after(*dfs) as stream:
|
|
3241
|
+
# Special should_broadcast case.
|
|
3242
|
+
# Used to recombine decomposed expressions
|
|
3243
|
+
if should_broadcast:
|
|
3244
|
+
result = DataFrame(
|
|
3245
|
+
broadcast(
|
|
3246
|
+
*itertools.chain.from_iterable(df.columns for df in dfs),
|
|
3247
|
+
stream=stream,
|
|
3248
|
+
),
|
|
3173
3249
|
stream=stream,
|
|
3174
|
-
)
|
|
3175
|
-
|
|
3176
|
-
|
|
3177
|
-
|
|
3178
|
-
|
|
3179
|
-
|
|
3180
|
-
|
|
3181
|
-
|
|
3182
|
-
|
|
3183
|
-
|
|
3184
|
-
|
|
3185
|
-
|
|
3186
|
-
|
|
3187
|
-
|
|
3188
|
-
|
|
3189
|
-
|
|
3190
|
-
|
|
3191
|
-
|
|
3192
|
-
|
|
3250
|
+
)
|
|
3251
|
+
else:
|
|
3252
|
+
max_rows = max(df.num_rows for df in dfs)
|
|
3253
|
+
# Horizontal concatenation extends shorter tables with nulls
|
|
3254
|
+
result = DataFrame(
|
|
3255
|
+
itertools.chain.from_iterable(
|
|
3256
|
+
df.columns
|
|
3257
|
+
for df in (
|
|
3258
|
+
df
|
|
3259
|
+
if df.num_rows == max_rows
|
|
3260
|
+
else DataFrame.from_table(
|
|
3261
|
+
cls._extend_with_nulls(
|
|
3262
|
+
df.table,
|
|
3263
|
+
nrows=max_rows - df.num_rows,
|
|
3264
|
+
stream=stream,
|
|
3265
|
+
),
|
|
3266
|
+
df.column_names,
|
|
3267
|
+
df.dtypes,
|
|
3268
|
+
stream=stream,
|
|
3269
|
+
)
|
|
3270
|
+
for df in dfs
|
|
3193
3271
|
)
|
|
3194
|
-
|
|
3195
|
-
|
|
3196
|
-
)
|
|
3197
|
-
stream=stream,
|
|
3198
|
-
)
|
|
3199
|
-
|
|
3200
|
-
# Join the original streams back into the result stream to ensure that the
|
|
3201
|
-
# deallocations (on the original streams) happen after the result is ready
|
|
3202
|
-
join_cuda_streams(
|
|
3203
|
-
downstreams=[df.stream for df in dfs], upstreams=(result.stream,)
|
|
3204
|
-
)
|
|
3272
|
+
),
|
|
3273
|
+
stream=stream,
|
|
3274
|
+
)
|
|
3205
3275
|
|
|
3206
3276
|
return result
|
|
3207
3277
|
|