dask-cuda 24.6.0__py3-none-any.whl → 24.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dask_cuda/VERSION CHANGED
@@ -1 +1 @@
- 24.06.00
+ 24.08.02
dask_cuda/_version.py CHANGED
@@ -15,6 +15,16 @@
  import importlib.resources

  __version__ = (
-     importlib.resources.files("dask_cuda").joinpath("VERSION").read_text().strip()
+     importlib.resources.files(__package__).joinpath("VERSION").read_text().strip()
  )
- __git_commit__ = "2fc151b061e90fae0cf95b45dbd62507aa8dd7e6"
+ try:
+     __git_commit__ = (
+         importlib.resources.files(__package__)
+         .joinpath("GIT_COMMIT")
+         .read_text()
+         .strip()
+     )
+ except FileNotFoundError:
+     __git_commit__ = ""
+
+ __all__ = ["__git_commit__", "__version__"]
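The hard-coded commit hash is replaced by data files resolved at import time. A quick way to inspect the resulting constants, mirroring the new `dask_cuda/tests/test_version.py` added later in this diff:

```python
import dask_cuda

# Resolved from the packaged VERSION file via importlib.resources.
print(dask_cuda.__version__)      # "24.08.02" for this wheel

# Read from an optional GIT_COMMIT file; falls back to "" when that file is not shipped.
print(dask_cuda.__git_commit__)
```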
dask_cuda/benchmarks/common.py CHANGED
@@ -117,16 +117,18 @@ def run(client: Client, args: Namespace, config: Config):
      wait_for_cluster(client, shutdown_on_failure=True)
      assert len(client.scheduler_info()["workers"]) > 0
      setup_memory_pools(
-         client,
-         args.type == "gpu",
-         args.rmm_pool_size,
-         args.disable_rmm_pool,
-         args.enable_rmm_async,
-         args.enable_rmm_managed,
-         args.rmm_release_threshold,
-         args.rmm_log_directory,
-         args.enable_rmm_statistics,
-         args.enable_rmm_track_allocations,
+         client=client,
+         is_gpu=args.type == "gpu",
+         disable_rmm=args.disable_rmm,
+         disable_rmm_pool=args.disable_rmm_pool,
+         pool_size=args.rmm_pool_size,
+         maximum_pool_size=args.rmm_maximum_pool_size,
+         rmm_async=args.enable_rmm_async,
+         rmm_managed=args.enable_rmm_managed,
+         release_threshold=args.rmm_release_threshold,
+         log_directory=args.rmm_log_directory,
+         statistics=args.enable_rmm_statistics,
+         rmm_track_allocations=args.enable_rmm_track_allocations,
      )
      address_to_index, results, message_data = gather_bench_results(client, args, config)
      p2p_bw = peer_to_peer_bandwidths(message_data, address_to_index)
dask_cuda/benchmarks/utils.py CHANGED
@@ -17,6 +17,7 @@ from dask.utils import format_bytes, format_time, parse_bytes
  from distributed.comm.addressing import get_address_host

  from dask_cuda.local_cuda_cluster import LocalCUDACluster
+ from dask_cuda.utils import parse_device_memory_limit


  def as_noop(dsk):
@@ -93,15 +94,41 @@ def parse_benchmark_args(
          "'forkserver' can be used to avoid issues with fork not being allowed "
          "after the networking stack has been initialised.",
      )
+     cluster_args.add_argument(
+         "--disable-rmm",
+         action="store_true",
+         help="Disable RMM.",
+     )
+     cluster_args.add_argument(
+         "--disable-rmm-pool",
+         action="store_true",
+         help="Uses RMM for allocations but without a memory pool.",
+     )
      cluster_args.add_argument(
          "--rmm-pool-size",
          default=None,
          type=parse_bytes,
          help="The size of the RMM memory pool. Can be an integer (bytes) or a string "
-         "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used.",
+         "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used."
+         ""
+         ".. note::"
+         " This size is a per-worker configuration, and not cluster-wide.",
      )
      cluster_args.add_argument(
-         "--disable-rmm-pool", action="store_true", help="Disable the RMM memory pool"
+         "--rmm-maximum-pool-size",
+         default=None,
+         help="When ``--rmm-pool-size`` is specified, this argument indicates the "
+         "maximum pool size. Can be an integer (bytes), or a string (like '4GB' or "
+         "'5000M'). By default, the total available memory on the GPU is used. "
+         "``rmm_pool_size`` must be specified to use RMM pool and to set the maximum "
+         "pool size."
+         ""
+         ".. note::"
+         " When paired with `--enable-rmm-async` the maximum size cannot be "
+         " guaranteed due to fragmentation."
+         ""
+         ".. note::"
+         " This size is a per-worker configuration, and not cluster-wide.",
      )
      cluster_args.add_argument(
          "--enable-rmm-managed",
@@ -407,10 +434,29 @@ def get_worker_device():
          return -1


+ def setup_rmm_resources(statistics=False, rmm_track_allocations=False):
+     import cupy
+
+     import rmm
+     from rmm.allocators.cupy import rmm_cupy_allocator
+
+     cupy.cuda.set_allocator(rmm_cupy_allocator)
+     if statistics:
+         rmm.mr.set_current_device_resource(
+             rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource())
+         )
+     if rmm_track_allocations:
+         rmm.mr.set_current_device_resource(
+             rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource())
+         )
+
+
  def setup_memory_pool(
      dask_worker=None,
+     disable_rmm=None,
+     disable_rmm_pool=None,
      pool_size=None,
-     disable_pool=False,
+     maximum_pool_size=None,
      rmm_async=False,
      rmm_managed=False,
      release_threshold=None,
@@ -418,45 +464,66 @@ def setup_memory_pool(
      statistics=False,
      rmm_track_allocations=False,
  ):
-     import cupy
-
      import rmm
-     from rmm.allocators.cupy import rmm_cupy_allocator

      from dask_cuda.utils import get_rmm_log_file_name

      logging = log_directory is not None

-     if rmm_async:
-         rmm.mr.set_current_device_resource(
-             rmm.mr.CudaAsyncMemoryResource(
-                 initial_pool_size=pool_size, release_threshold=release_threshold
-             )
-         )
-     else:
-         rmm.reinitialize(
-             pool_allocator=not disable_pool,
-             managed_memory=rmm_managed,
-             initial_pool_size=pool_size,
-             logging=logging,
-             log_file_name=get_rmm_log_file_name(dask_worker, logging, log_directory),
-         )
-     cupy.cuda.set_allocator(rmm_cupy_allocator)
-     if statistics:
-         rmm.mr.set_current_device_resource(
-             rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource())
+     if pool_size is not None:
+         pool_size = parse_device_memory_limit(pool_size, alignment_size=256)
+
+     if maximum_pool_size is not None:
+         maximum_pool_size = parse_device_memory_limit(
+             maximum_pool_size, alignment_size=256
          )
-     if rmm_track_allocations:
-         rmm.mr.set_current_device_resource(
-             rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource())
+
+     if release_threshold is not None:
+         release_threshold = parse_device_memory_limit(
+             release_threshold, alignment_size=256
          )

+     if not disable_rmm:
+         if rmm_async:
+             mr = rmm.mr.CudaAsyncMemoryResource(
+                 initial_pool_size=pool_size,
+                 release_threshold=release_threshold,
+             )
+
+             if maximum_pool_size is not None:
+                 mr = rmm.mr.LimitingResourceAdaptor(
+                     mr, allocation_limit=maximum_pool_size
+                 )
+
+             rmm.mr.set_current_device_resource(mr)
+
+             setup_rmm_resources(
+                 statistics=statistics, rmm_track_allocations=rmm_track_allocations
+             )
+         else:
+             rmm.reinitialize(
+                 pool_allocator=not disable_rmm_pool,
+                 managed_memory=rmm_managed,
+                 initial_pool_size=pool_size,
+                 maximum_pool_size=maximum_pool_size,
+                 logging=logging,
+                 log_file_name=get_rmm_log_file_name(
+                     dask_worker, logging, log_directory
+                 ),
+             )
+
+             setup_rmm_resources(
+                 statistics=statistics, rmm_track_allocations=rmm_track_allocations
+             )
+

  def setup_memory_pools(
      client,
      is_gpu,
+     disable_rmm,
+     disable_rmm_pool,
      pool_size,
-     disable_pool,
+     maximum_pool_size,
      rmm_async,
      rmm_managed,
      release_threshold,
@@ -468,8 +535,10 @@ def setup_memory_pools(
          return
      client.run(
          setup_memory_pool,
+         disable_rmm=disable_rmm,
+         disable_rmm_pool=disable_rmm_pool,
          pool_size=pool_size,
-         disable_pool=disable_pool,
+         maximum_pool_size=maximum_pool_size,
          rmm_async=rmm_async,
          rmm_managed=rmm_managed,
          release_threshold=release_threshold,
@@ -482,7 +551,9 @@
      client.run_on_scheduler(
          setup_memory_pool,
          pool_size=1e9,
-         disable_pool=disable_pool,
+         disable_rmm=disable_rmm,
+         disable_rmm_pool=disable_rmm_pool,
+         maximum_pool_size=maximum_pool_size,
          rmm_async=rmm_async,
          rmm_managed=rmm_managed,
          release_threshold=release_threshold,
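The benchmark CLI now exposes `--rmm-maximum-pool-size`; for the async allocator the cap is applied by wrapping the memory resource in a `LimitingResourceAdaptor`, as shown in the `setup_memory_pool` hunk above. A minimal standalone sketch of that capping logic, with assumed example sizes (requires a CUDA GPU and the `rmm` package):

```python
import rmm

pool_size = 2 * 1024**3          # assumed 2 GiB initial pool
maximum_pool_size = 4 * 1024**3  # assumed 4 GiB cap

mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=pool_size)
# The async pool has no hard maximum of its own; the adaptor limits the bytes
# handed out, not the pool's actual footprint, which is why the new help text
# warns the maximum cannot be guaranteed under fragmentation with --enable-rmm-async.
mr = rmm.mr.LimitingResourceAdaptor(mr, allocation_limit=maximum_pool_size)
rmm.mr.set_current_device_resource(mr)
```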
dask_cuda/cli.py CHANGED
@@ -101,6 +101,20 @@ def cuda():
      total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"`` or 0 to
      disable spilling to host (i.e. allow full device memory usage).""",
  )
+ @click.option(
+     "--enable-cudf-spill/--disable-cudf-spill",
+     default=False,
+     show_default=True,
+     help="""Enable automatic cuDF spilling. WARNING: This should NOT be used with
+     JIT-Unspill.""",
+ )
+ @click.option(
+     "--cudf-spill-stats",
+     type=int,
+     default=0,
+     help="""Set the cuDF spilling statistics level. This option has no effect if
+     `--enable-cudf-spill` is not specified.""",
+ )
  @click.option(
      "--rmm-pool-size",
      default=None,
@@ -120,6 +134,10 @@ def cuda():
      memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool and
      to set the maximum pool size.

+     .. note::
+         When paired with `--enable-rmm-async` the maximum size cannot be guaranteed due
+         to fragmentation.
+
      .. note::
          This size is a per-worker configuration, and not cluster-wide.""",
  )
@@ -326,6 +344,8 @@ def worker(
      name,
      memory_limit,
      device_memory_limit,
+     enable_cudf_spill,
+     cudf_spill_stats,
      rmm_pool_size,
      rmm_maximum_pool_size,
      rmm_managed_memory,
@@ -398,6 +418,8 @@ def worker(
          name,
          memory_limit,
          device_memory_limit,
+         enable_cudf_spill,
+         cudf_spill_stats,
          rmm_pool_size,
          rmm_maximum_pool_size,
          rmm_managed_memory,
dask_cuda/cuda_worker.py CHANGED
@@ -20,7 +20,7 @@ from distributed.worker_memory import parse_memory_limit
  from .device_host_file import DeviceHostFile
  from .initialize import initialize
- from .plugins import CPUAffinity, PreImport, RMMSetup
+ from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
  from .proxify_host_file import ProxifyHostFile
  from .utils import (
      cuda_visible_devices,
@@ -41,6 +41,8 @@ class CUDAWorker(Server):
          name=None,
          memory_limit="auto",
          device_memory_limit="auto",
+         enable_cudf_spill=False,
+         cudf_spill_stats=0,
          rmm_pool_size=None,
          rmm_maximum_pool_size=None,
          rmm_managed_memory=False,
@@ -166,6 +168,12 @@ class CUDAWorker(Server):
          if device_memory_limit is None and memory_limit is None:
              data = lambda _: {}
          elif jit_unspill:
+             if enable_cudf_spill:
+                 warnings.warn(
+                     "Enabling cuDF spilling and JIT-Unspill together is not "
+                     "safe, consider disabling JIT-Unspill."
+                 )
+
              data = lambda i: (
                  ProxifyHostFile,
                  {
@@ -217,6 +225,7 @@ class CUDAWorker(Server):
                          track_allocations=rmm_track_allocations,
                      ),
                      PreImport(pre_import),
+                     CUDFSetup(spill=enable_cudf_spill, spill_stats=cudf_spill_stats),
                  },
                  name=name if nprocs == 1 or name is None else str(name) + "-" + str(i),
                  local_directory=local_directory,
dask_cuda/explicit_comms/dataframe/shuffle.py CHANGED
@@ -8,6 +8,9 @@ from math import ceil
  from operator import getitem
  from typing import Any, Callable, Dict, List, Optional, Set, TypeVar

+ import numpy as np
+ import pandas as pd
+
  import dask
  import dask.config
  import dask.dataframe
@@ -155,9 +158,16 @@ def compute_map_index(
      if column_names[0] == "_partitions":
          ind = df[column_names[0]]
      else:
-         ind = hash_object_dispatch(
-             df[column_names] if column_names else df, index=False
-         )
+         # Need to cast numerical dtypes to be consistent
+         # with `dask.dataframe.shuffle.partitioning_index`
+         dtypes = {}
+         index = df[column_names] if column_names else df
+         for col, dtype in index.dtypes.items():
+             if pd.api.types.is_numeric_dtype(dtype):
+                 dtypes[col] = np.float64
+         if dtypes:
+             index = index.astype(dtypes, errors="ignore")
+         ind = hash_object_dispatch(index, index=False)
      return ind % npartitions


@@ -187,15 +197,8 @@ def partition_dataframe(
      partitions
          Dict of dataframe-partitions, mapping partition-ID to dataframe
      """
-     if column_names[0] != "_partitions" and hasattr(df, "partition_by_hash"):
-         return dict(
-             zip(
-                 range(npartitions),
-                 df.partition_by_hash(
-                     column_names, npartitions, keep_index=not ignore_index
-                 ),
-             )
-         )
+     # TODO: Use `partition_by_hash` if/when dtype-casting is added
+     # (See: https://github.com/rapidsai/cudf/issues/16221)
      map_index = compute_map_index(df, column_names, npartitions)
      return group_split_dispatch(df, map_index, npartitions, ignore_index=ignore_index)

@@ -529,18 +532,19 @@
      # TODO: can we do this without using `submit()` to avoid the overhead
      # of creating a Future for each dataframe partition?

-     futures = []
+     _futures = {}
      for rank in ranks:
          for part_id in rank_to_out_part_ids[rank]:
-             futures.append(
-                 c.client.submit(
-                     getitem,
-                     shuffle_result[rank],
-                     part_id,
-                     workers=[c.worker_addresses[rank]],
-                 )
+             _futures[part_id] = c.client.submit(
+                 getitem,
+                 shuffle_result[rank],
+                 part_id,
+                 workers=[c.worker_addresses[rank]],
              )

+     # Make sure partitions are properly ordered
+     futures = [_futures.pop(i) for i in range(npartitions)]
+
      # Create a distributed Dataframe from all the pieces
      divs = [None] * (len(futures) + 1)
      kwargs = {"meta": df_meta, "divisions": divs, "prefix": "explicit-comms-shuffle"}
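Per the comment in the `compute_map_index` hunk above, the explicit-comms hash now stays consistent with `dask.dataframe.shuffle.partitioning_index` by casting numeric key columns to `float64` before hashing. A pandas-only illustration of that normalization, with `pd.util.hash_pandas_object` standing in for `hash_object_dispatch` (assumed column names, for illustration only):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": np.arange(100, dtype="int64"), "payload": np.random.random(100)})
npartitions = 4

index = df[["key"]]
# Cast numeric key columns to float64 before hashing, as the new compute_map_index does.
dtypes = {
    col: np.float64
    for col, dtype in index.dtypes.items()
    if pd.api.types.is_numeric_dtype(dtype)
}
if dtypes:
    index = index.astype(dtypes, errors="ignore")

map_index = pd.util.hash_pandas_object(index, index=False) % npartitions
print(map_index.value_counts())  # rows assigned to each output partition
```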
dask_cuda/local_cuda_cluster.py CHANGED
@@ -10,7 +10,7 @@ from distributed.worker_memory import parse_memory_limit
  from .device_host_file import DeviceHostFile
  from .initialize import initialize
- from .plugins import CPUAffinity, PreImport, RMMSetup
+ from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
  from .proxify_host_file import ProxifyHostFile
  from .utils import (
      cuda_visible_devices,
@@ -73,6 +73,14 @@ class LocalCUDACluster(LocalCluster):
          starts spilling to host memory. Can be an integer (bytes), float (fraction of
          total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"``, 0,
          or ``None`` to disable spilling to host (i.e. allow full device memory usage).
+     enable_cudf_spill : bool, default False
+         Enable automatic cuDF spilling.
+
+         .. warning::
+             This should NOT be used together with JIT-Unspill.
+     cudf_spill_stats : int, default 0
+         Set the cuDF spilling statistics level. This option has no effect if
+         ``enable_cudf_spill=False``.
      local_directory : str or None, default None
          Path on local machine to store temporary files. Can be a string (like
          ``"path/to/files"``) or ``None`` to fall back on the value of
@@ -114,6 +122,10 @@ class LocalCUDACluster(LocalCluster):
          memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool
          and to set the maximum pool size.

+         .. note::
+             When paired with `--enable-rmm-async` the maximum size cannot be guaranteed
+             due to fragmentation.
+
          .. note::
              This size is a per-worker configuration, and not cluster-wide.
      rmm_managed_memory : bool, default False
@@ -205,6 +217,8 @@ class LocalCUDACluster(LocalCluster):
          threads_per_worker=1,
          memory_limit="auto",
          device_memory_limit=0.8,
+         enable_cudf_spill=False,
+         cudf_spill_stats=0,
          data=None,
          local_directory=None,
          shared_filesystem=None,
@@ -255,6 +269,8 @@ class LocalCUDACluster(LocalCluster):
          self.device_memory_limit = parse_device_memory_limit(
              device_memory_limit, device_index=nvml_device_index(0, CUDA_VISIBLE_DEVICES)
          )
+         self.enable_cudf_spill = enable_cudf_spill
+         self.cudf_spill_stats = cudf_spill_stats

          self.rmm_pool_size = rmm_pool_size
          self.rmm_maximum_pool_size = rmm_maximum_pool_size
@@ -298,6 +314,12 @@ class LocalCUDACluster(LocalCluster):
          if device_memory_limit is None and memory_limit is None:
              data = {}
          elif jit_unspill:
+             if enable_cudf_spill:
+                 warnings.warn(
+                     "Enabling cuDF spilling and JIT-Unspill together is not "
+                     "safe, consider disabling JIT-Unspill."
+                 )
+
              data = (
                  ProxifyHostFile,
                  {
@@ -410,6 +432,7 @@ class LocalCUDACluster(LocalCluster):
                          track_allocations=self.rmm_track_allocations,
                      ),
                      PreImport(self.pre_import),
+                     CUDFSetup(self.enable_cudf_spill, self.cudf_spill_stats),
                  },
              }
          )
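The new keywords are exercised by the tests added further down in this diff; a minimal sketch of enabling cuDF spilling on a local cluster (assumes at least one GPU and `cudf` installed):

```python
import cudf
from distributed import Client

from dask_cuda import LocalCUDACluster

if __name__ == "__main__":
    with LocalCUDACluster(enable_cudf_spill=True, cudf_spill_stats=2) as cluster:
        with Client(cluster) as client:
            # The CUDFSetup worker plugin applies these options on every worker.
            print(client.run(cudf.get_option, "spill"))        # {worker_addr: True, ...}
            print(client.run(cudf.get_option, "spill_stats"))  # {worker_addr: 2, ...}
```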
dask_cuda/plugins.py CHANGED
@@ -14,6 +14,21 @@ class CPUAffinity(WorkerPlugin):
          os.sched_setaffinity(0, self.cores)


+ class CUDFSetup(WorkerPlugin):
+     def __init__(self, spill, spill_stats):
+         self.spill = spill
+         self.spill_stats = spill_stats
+
+     def setup(self, worker=None):
+         try:
+             import cudf
+
+             cudf.set_option("spill", self.spill)
+             cudf.set_option("spill_stats", self.spill_stats)
+         except ImportError:
+             pass
+
+
  class RMMSetup(WorkerPlugin):
      def __init__(
          self,
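Because `CUDFSetup` is a plain distributed `WorkerPlugin`, it could also be registered on an already-running cluster from the client side; a hypothetical sketch (assumes a `distributed` version that provides `Client.register_plugin` and a reachable scheduler at a placeholder address):

```python
from distributed import Client

from dask_cuda.plugins import CUDFSetup

client = Client("tcp://scheduler-host:8786")  # placeholder scheduler address
client.register_plugin(CUDFSetup(spill=True, spill_stats=1))
```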
dask_cuda/tests/test_cudf_builtin_spilling.py CHANGED
@@ -20,7 +20,7 @@ from cudf.core.buffer.spill_manager import ( # noqa: E402
      get_global_manager,
      set_global_manager,
  )
- from cudf.testing._utils import assert_eq  # noqa: E402
+ from cudf.testing import assert_eq  # noqa: E402

  if get_global_manager() is not None:
      pytest.skip(
dask_cuda/tests/test_dask_cuda_worker.py CHANGED
@@ -231,6 +231,64 @@ def test_rmm_logging(loop): # noqa: F811
                      assert v is rmm.mr.LoggingResourceAdaptor


+ def test_cudf_spill_disabled(loop):  # noqa: F811
+     cudf = pytest.importorskip("cudf")
+     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
+         with popen(
+             [
+                 "dask",
+                 "cuda",
+                 "worker",
+                 "127.0.0.1:9369",
+                 "--host",
+                 "127.0.0.1",
+                 "--no-dashboard",
+             ]
+         ):
+             with Client("127.0.0.1:9369", loop=loop) as client:
+                 assert wait_workers(client, n_gpus=get_n_gpus())
+
+                 cudf_spill = client.run(
+                     cudf.get_option,
+                     "spill",
+                 )
+                 for v in cudf_spill.values():
+                     assert v is False
+
+                 cudf_spill_stats = client.run(cudf.get_option, "spill_stats")
+                 for v in cudf_spill_stats.values():
+                     assert v == 0
+
+
+ def test_cudf_spill(loop):  # noqa: F811
+     cudf = pytest.importorskip("cudf")
+     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
+         with popen(
+             [
+                 "dask",
+                 "cuda",
+                 "worker",
+                 "127.0.0.1:9369",
+                 "--host",
+                 "127.0.0.1",
+                 "--no-dashboard",
+                 "--enable-cudf-spill",
+                 "--cudf-spill-stats",
+                 "2",
+             ]
+         ):
+             with Client("127.0.0.1:9369", loop=loop) as client:
+                 assert wait_workers(client, n_gpus=get_n_gpus())
+
+                 cudf_spill = client.run(cudf.get_option, "spill")
+                 for v in cudf_spill.values():
+                     assert v is True
+
+                 cudf_spill_stats = client.run(cudf.get_option, "spill_stats")
+                 for v in cudf_spill_stats.values():
+                     assert v == 2
+
+
  @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
  def test_dashboard_address(loop):  # noqa: F811
      with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
dask_cuda/tests/test_explicit_comms.py CHANGED
@@ -109,7 +109,14 @@ def test_dataframe_merge_empty_partitions():

  def check_partitions(df, npartitions):
      """Check that all values in `df` hashes to the same"""
-     hashes = partitioning_index(df, npartitions)
+     dtypes = {}
+     for col, dtype in df.dtypes.items():
+         if pd.api.types.is_numeric_dtype(dtype):
+             dtypes[col] = np.float64
+     if not dtypes:
+         dtypes = None
+
+     hashes = partitioning_index(df, npartitions, cast_dtype=dtypes)
      if len(hashes) > 0:
          return len(hashes.unique()) == 1
      else:
@@ -128,11 +135,10 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):
          worker_class=IncreasedCloseTimeoutNanny,
          processes=True,
      ) as cluster:
-         with Client(cluster) as client:
-             all_workers = list(client.get_worker_logs().keys())
+         with Client(cluster):
              comms.default_comms()
              np.random.seed(42)
-             df = pd.DataFrame({"key": np.random.random(100)})
+             df = pd.DataFrame({"key": np.random.randint(0, high=100, size=100)})
              if backend == "cudf":
                  df = cudf.DataFrame.from_pandas(df)

@@ -141,15 +147,13 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):

              for input_nparts in range(1, 5):
                  for output_nparts in range(1, 5):
-                     ddf = dd.from_pandas(df.copy(), npartitions=input_nparts).persist(
-                         workers=all_workers
-                     )
+                     ddf1 = dd.from_pandas(df.copy(), npartitions=input_nparts)
                      # To reduce test runtime, we change the batchsizes here instead
                      # of using a test parameter.
                      for batchsize in (-1, 1, 2):
                          with dask.config.set(explicit_comms_batchsize=batchsize):
                              ddf = explicit_comms_shuffle(
-                                 ddf,
+                                 ddf1,
                                  ["_partitions"] if _partitions else ["key"],
                                  npartitions=output_nparts,
                                  batchsize=batchsize,
@@ -177,6 +181,32 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):
                              got = ddf.compute().sort_values("key")
                              assert_eq(got, expected)

+                             # Check that partitioning is consistent with "tasks"
+                             ddf_tasks = ddf1.shuffle(
+                                 ["key"],
+                                 npartitions=output_nparts,
+                                 shuffle_method="tasks",
+                             )
+                             for i in range(output_nparts):
+                                 expected_partition = ddf_tasks.partitions[
+                                     i
+                                 ].compute()["key"]
+                                 actual_partition = ddf.partitions[i].compute()[
+                                     "key"
+                                 ]
+                                 if backend == "cudf":
+                                     expected_partition = (
+                                         expected_partition.values_host
+                                     )
+                                     actual_partition = actual_partition.values_host
+                                 else:
+                                     expected_partition = expected_partition.values
+                                     actual_partition = actual_partition.values
+                                 assert all(
+                                     np.sort(expected_partition)
+                                     == np.sort(actual_partition)
+                                 )
+

  @pytest.mark.parametrize("nworkers", [1, 2, 3])
  @pytest.mark.parametrize("backend", ["pandas", "cudf"])
dask_cuda/tests/test_local_cuda_cluster.py CHANGED
@@ -500,6 +500,54 @@ async def test_worker_fraction_limits():
          )


+ @gen_test(timeout=20)
+ async def test_cudf_spill_disabled():
+     cudf = pytest.importorskip("cudf")
+
+     async with LocalCUDACluster(
+         asynchronous=True,
+     ) as cluster:
+         async with Client(cluster, asynchronous=True) as client:
+             cudf_spill = await client.run(
+                 cudf.get_option,
+                 "spill",
+             )
+             for v in cudf_spill.values():
+                 assert v is False
+
+             cudf_spill_stats = await client.run(
+                 cudf.get_option,
+                 "spill_stats",
+             )
+             for v in cudf_spill_stats.values():
+                 assert v == 0
+
+
+ @gen_test(timeout=20)
+ async def test_cudf_spill():
+     cudf = pytest.importorskip("cudf")
+
+     async with LocalCUDACluster(
+         enable_cudf_spill=True,
+         cudf_spill_stats=2,
+         asynchronous=True,
+     ) as cluster:
+         async with Client(cluster, asynchronous=True) as client:
+             cudf_spill = await client.run(
+                 cudf.get_option,
+                 "spill",
+             )
+             for v in cudf_spill.values():
+                 assert v is True
+
+             cudf_spill_stats = await client.run(
+                 cudf.get_option,
+                 "spill_stats",
+             )
+             for v in cudf_spill_stats.values():
+                 assert v == 2
+
+
  @pytest.mark.parametrize(
      "protocol",
      ["ucx", "ucxx"],
dask_cuda/tests/test_version.py ADDED
@@ -0,0 +1,12 @@
+ # Copyright (c) 2024, NVIDIA CORPORATION.
+
+ import dask_cuda
+
+
+ def test_version_constants_are_populated():
+     # __git_commit__ will only be non-empty in a built distribution
+     assert isinstance(dask_cuda.__git_commit__, str)
+
+     # __version__ should always be non-empty
+     assert isinstance(dask_cuda.__version__, str)
+     assert len(dask_cuda.__version__) > 0
dask_cuda-24.6.0.dist-info/METADATA → dask_cuda-24.8.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dask-cuda
- Version: 24.6.0
+ Version: 24.8.2
  Summary: Utilities for Dask and CUDA interactions
  Author: NVIDIA Corporation
  License: Apache 2.0
@@ -18,25 +18,25 @@ Classifier: Programming Language :: Python :: 3.11
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: click >=8.1
- Requires-Dist: numba >=0.57
- Requires-Dist: numpy <2.0a0,>=1.23
- Requires-Dist: pandas >=1.3
- Requires-Dist: pynvml <11.5,>=11.0.0
- Requires-Dist: rapids-dask-dependency ==24.6.*
- Requires-Dist: zict >=2.0.0
+ Requires-Dist: click>=8.1
+ Requires-Dist: numba>=0.57
+ Requires-Dist: numpy<2.0a0,>=1.23
+ Requires-Dist: pandas>=1.3
+ Requires-Dist: pynvml<11.5,>=11.0.0
+ Requires-Dist: rapids-dask-dependency==24.8.*
+ Requires-Dist: zict>=2.0.0
  Provides-Extra: docs
- Requires-Dist: numpydoc >=1.1.0 ; extra == 'docs'
- Requires-Dist: sphinx ; extra == 'docs'
- Requires-Dist: sphinx-click >=2.7.1 ; extra == 'docs'
- Requires-Dist: sphinx-rtd-theme >=0.5.1 ; extra == 'docs'
+ Requires-Dist: numpydoc>=1.1.0; extra == "docs"
+ Requires-Dist: sphinx; extra == "docs"
+ Requires-Dist: sphinx-click>=2.7.1; extra == "docs"
+ Requires-Dist: sphinx-rtd-theme>=0.5.1; extra == "docs"
  Provides-Extra: test
- Requires-Dist: cudf ==24.6.* ; extra == 'test'
- Requires-Dist: dask-cudf ==24.6.* ; extra == 'test'
- Requires-Dist: kvikio ==24.6.* ; extra == 'test'
- Requires-Dist: pytest ; extra == 'test'
- Requires-Dist: pytest-cov ; extra == 'test'
- Requires-Dist: ucx-py ==0.38.* ; extra == 'test'
+ Requires-Dist: cudf==24.8.*; extra == "test"
+ Requires-Dist: dask-cudf==24.8.*; extra == "test"
+ Requires-Dist: kvikio==24.8.*; extra == "test"
+ Requires-Dist: pytest; extra == "test"
+ Requires-Dist: pytest-cov; extra == "test"
+ Requires-Dist: ucx-py==0.39.*; extra == "test"

  Dask CUDA
  =========
dask_cuda-24.6.0.dist-info/RECORD → dask_cuda-24.8.2.dist-info/RECORD CHANGED
@@ -1,16 +1,16 @@
- dask_cuda/VERSION,sha256=dIWV5q3UAaQInFeBt7NGhhmqTBqP_0Y540pyLeZ8mkc,9
+ dask_cuda/VERSION,sha256=5YtjwV2EoD7E5Ed4K-PvnU0eEtdkkn33JHuNFDy8oKA,8
  dask_cuda/__init__.py,sha256=JLDWev7vI_dPusLgRdOwXBz-xfhlX_hc-DzmLtrEYO0,1918
- dask_cuda/_version.py,sha256=U6CHD0Kkafws8nJSbEwZcu-ZKReghzbciFgluwauXtg,778
- dask_cuda/cli.py,sha256=XNRH0bu-6jzRoyWJB5qSWuzePJSh3z_5Ng6rDCnz7lg,15970
- dask_cuda/cuda_worker.py,sha256=bIu-ESeIpJG_WaTYrv0z9z5juJ1qR5i_5Ng3CN1WK8s,8579
+ dask_cuda/_version.py,sha256=cHDO9AzNtxkCVhwYu7hL3H7RPAkQnxpKBjElOst3rkI,964
+ dask_cuda/cli.py,sha256=Y3aObfAyMwOIo0oVz3-NC2InGLShOpeINwW5ROTF2s8,16616
+ dask_cuda/cuda_worker.py,sha256=uqyoDKsSe7sKN3StMVyz_971rj0Sjpmwfv7Bj083Wss,8959
  dask_cuda/device_host_file.py,sha256=yS31LGtt9VFAG78uBBlTDr7HGIng2XymV1OxXIuEMtM,10272
  dask_cuda/disk_io.py,sha256=urSLKiPvJvYmKCzDPOUDCYuLI3r1RUiyVh3UZGRoF_Y,6626
  dask_cuda/get_device_memory_objects.py,sha256=R3U2cq4fJZPgtsUKyIguy9161p3Q99oxmcCmTcg6BtQ,4075
  dask_cuda/initialize.py,sha256=Gjcxs_c8DTafgsHe5-2mw4lJdOmbFJJAZVOnxA8lTjM,6462
  dask_cuda/is_device_object.py,sha256=CnajvbQiX0FzFzwft0MqK1OPomx3ZGDnDxT56wNjixw,1046
  dask_cuda/is_spillable_object.py,sha256=CddGmg0tuSpXh2m_TJSY6GRpnl1WRHt1CRcdWgHPzWA,1457
- dask_cuda/local_cuda_cluster.py,sha256=hoEiEfJqAQrRS7N632VatSl1245GiWMT5B77Wc-i5C0,17928
- dask_cuda/plugins.py,sha256=cnHsdrXx7PBPmrzHX6YEkCH5byCsUk8LE2FeTeu8ZLU,4259
+ dask_cuda/local_cuda_cluster.py,sha256=jgXjd6OvEDfQ3iXU8hV_UfULa13GZsli0SGC2PIouZk,18882
+ dask_cuda/plugins.py,sha256=DCf7PnIBu_VNjFfrFeb1zCNuEnCaX9oz4Umn76t02Mc,4630
  dask_cuda/proxify_device_objects.py,sha256=99CD7LOE79YiQGJ12sYl_XImVhJXpFR4vG5utdkjTQo,8108
  dask_cuda/proxify_host_file.py,sha256=Wf5CFCC1JN5zmfvND3ls0M5FL01Y8VhHrk0xV3UQ9kk,30850
  dask_cuda/proxy_object.py,sha256=bZq92kjgFB-ad_luSAFT_RItV3nssmiEk4OOSp34laU,29812
@@ -18,36 +18,37 @@ dask_cuda/utils.py,sha256=RWlLK2cPHaCuNNhr8bW8etBeGklwREQJOafQbTydStk,25121
  dask_cuda/utils_test.py,sha256=WNMR0gic2tuP3pgygcR9g52NfyX8iGMOan6juXhpkCE,1694
  dask_cuda/worker_spec.py,sha256=7-Uq_e5q2SkTlsmctMcYLCa9_3RiiVHZLIN7ctfaFmE,4376
  dask_cuda/benchmarks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- dask_cuda/benchmarks/common.py,sha256=sEIFnRZS6wbyKCQyB4fDclYLc2YqC0PolurR5qzuRxw,6393
+ dask_cuda/benchmarks/common.py,sha256=2MnDdQjvHfGaUWDgiTcTGI_EeKPmVBEwoWfsJUNpOjU,6613
  dask_cuda/benchmarks/local_cudf_groupby.py,sha256=T9lA9nb4Wzu46AH--SJEVCeCm3650J7slapdNR_08FU,8904
  dask_cuda/benchmarks/local_cudf_merge.py,sha256=AsuVnMA3H93sJwjjgi4KaIdYKnnX1OeRMPiXizrwHGk,12577
  dask_cuda/benchmarks/local_cudf_shuffle.py,sha256=2xWJZf3gwDNimXKZN2ivtU3OE_qec1KNOhgL4_AGQZU,8655
  dask_cuda/benchmarks/local_cupy.py,sha256=aUKIYfeR7c77K4kKk697Rxo8tG8kFabQ9jQEVGr-oTs,10762
  dask_cuda/benchmarks/local_cupy_map_overlap.py,sha256=_texYmam1K_XbzIvURltui5KRsISGFNylXiGUtgRIz0,6442
- dask_cuda/benchmarks/utils.py,sha256=mrQAGbZCqx4N8AC-ASlw-vhDxz060D4i_oSksKZkl2c,27580
+ dask_cuda/benchmarks/utils.py,sha256=4k8KnJPOczKDQNBPRWlaGsU2zdEA09BDGgklUXggwMU,30008
  dask_cuda/explicit_comms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  dask_cuda/explicit_comms/comms.py,sha256=Su6PuNo68IyS-AwoqU4S9TmqWsLvUdNa0jot2hx8jQQ,10400
  dask_cuda/explicit_comms/dataframe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- dask_cuda/explicit_comms/dataframe/shuffle.py,sha256=qJP6WxY0EkuafGrpZDCxeVGuQIoAacYc1SchcpmK0WM,20368
- dask_cuda/tests/test_cudf_builtin_spilling.py,sha256=u3kW91YRLdHFycvpGfSQKrEucu5khMJ1k4sjmddO490,4910
- dask_cuda/tests/test_dask_cuda_worker.py,sha256=gViHaMCSfB6ip125OEi9D0nfKC-qBXRoHz6BRodEdb4,17729
+ dask_cuda/explicit_comms/dataframe/shuffle.py,sha256=4xfhfbTGa36YPs_ex1_fFhzfGMYJq-QkS5q0RwgeHh8,20645
+ dask_cuda/tests/test_cudf_builtin_spilling.py,sha256=qVN9J0Hdv66A9COFArLIdRriyyxEKpS3lEZGHbVHaq8,4903
+ dask_cuda/tests/test_dask_cuda_worker.py,sha256=o5g0_t-2M_2lfPeOPTS4NVF4rnQF0ZWAZekXw2h0xPc,19610
  dask_cuda/tests/test_device_host_file.py,sha256=79ssUISo1YhsW_7HdwqPfsH2LRzS2bi5BjPym1Sdgqw,5882
  dask_cuda/tests/test_dgx.py,sha256=BPCF4ZvhrVKkT43OOFHdijuo-M34vW3V18C8rRH1HXg,7489
- dask_cuda/tests/test_explicit_comms.py,sha256=l__DAIHx_DmV71LUEyvDNsLsHYYzafzvy0z_loFwQDo,13686
+ dask_cuda/tests/test_explicit_comms.py,sha256=Pa5vVx63qWtScnVJuS31WESXIt2FPyTJVFO-0OUbbmU,15276
  dask_cuda/tests/test_from_array.py,sha256=okT1B6UqHmLxoy0uER0Ylm3UyOmi5BAXwJpTuTAw44I,601
  dask_cuda/tests/test_gds.py,sha256=6jf0HPTHAIG8Mp_FC4Ai4zpn-U1K7yk0fSXg8He8-r8,1513
  dask_cuda/tests/test_initialize.py,sha256=Rba59ZbljEm1yyN94_sWZPEE_f7hWln95aiBVc49pmY,6960
- dask_cuda/tests/test_local_cuda_cluster.py,sha256=G3kR-4o-vCqWWfSuQLFKVEK0F243FaDSgRlDTUll5aU,18376
+ dask_cuda/tests/test_local_cuda_cluster.py,sha256=Lc9QncyGwBwhaZPGBfreXJf3ZC9Zd8SjDc2fpeQ-BT0,19710
  dask_cuda/tests/test_proxify_host_file.py,sha256=Yiv0sDcUoWw0d2oiPeHGoHqqSSM4lfQ4rChCiaxb6EU,18994
  dask_cuda/tests/test_proxy.py,sha256=OnGnPkl5ksCb-3hpEKG2z1OfPK9DbnOCtBHOjcUUjhg,23809
  dask_cuda/tests/test_spill.py,sha256=xN9PbVERBYMuZxvscSO0mAM22loq9WT3ltZVBFxlmM4,10239
  dask_cuda/tests/test_utils.py,sha256=JRIwXfemc3lWSzLJX0VcvR1_0wB4yeoOTsw7kB6z6pU,9176
+ dask_cuda/tests/test_version.py,sha256=vK2HjlRLX0nxwvRsYxBqhoZryBNZklzA-vdnyuWDxVg,365
  dask_cuda/tests/test_worker_spec.py,sha256=Bvu85vkqm6ZDAYPXKMJlI2pm9Uc5tiYKNtO4goXSw-I,2399
  examples/ucx/client_initialize.py,sha256=YN3AXHF8btcMd6NicKKhKR9SXouAsK1foJhFspbOn70,1262
  examples/ucx/local_cuda_cluster.py,sha256=7xVY3EhwhkY2L4VZin_BiMCbrjhirDNChoC86KiETNc,1983
- dask_cuda-24.6.0.dist-info/LICENSE,sha256=MjI3I-EgxfEvZlgjk82rgiFsZqSDXHFETd2QJ89UwDA,11348
- dask_cuda-24.6.0.dist-info/METADATA,sha256=eHHrrmTxKYk6JuFexzLAz8ybdummYxVAbqadz8fZGro,2570
- dask_cuda-24.6.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- dask_cuda-24.6.0.dist-info/entry_points.txt,sha256=UcRaKVEpywtxc6pF1VnfMB0UK4sJg7a8_NdZF67laPM,136
- dask_cuda-24.6.0.dist-info/top_level.txt,sha256=3kKxJxeM108fuYc_lwwlklP7YBU9IEmdmRAouzi397o,33
- dask_cuda-24.6.0.dist-info/RECORD,,
+ dask_cuda-24.8.2.dist-info/LICENSE,sha256=MjI3I-EgxfEvZlgjk82rgiFsZqSDXHFETd2QJ89UwDA,11348
+ dask_cuda-24.8.2.dist-info/METADATA,sha256=6iMwPI8cWrEYDYz73vm8pw-LkVeEgTQzymJgRxj32VQ,2546
+ dask_cuda-24.8.2.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ dask_cuda-24.8.2.dist-info/entry_points.txt,sha256=UcRaKVEpywtxc6pF1VnfMB0UK4sJg7a8_NdZF67laPM,136
+ dask_cuda-24.8.2.dist-info/top_level.txt,sha256=3kKxJxeM108fuYc_lwwlklP7YBU9IEmdmRAouzi397o,33
+ dask_cuda-24.8.2.dist-info/RECORD,,
dask_cuda-24.6.0.dist-info/WHEEL → dask_cuda-24.8.2.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.43.0)
+ Generator: setuptools (72.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
