cuda-cccl 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (177)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  21. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  22. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  23. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  24. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  25. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  26. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  27. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  30. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  31. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  32. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  33. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  34. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
  35. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  39. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  49. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
  52. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  55. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  56. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  57. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  58. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  59. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  60. cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
  61. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  62. cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
  63. cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
  64. cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
  65. cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
  66. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  67. cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
  68. cuda/cccl/headers/include/cuda/__event/event.h +8 -8
  69. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  70. cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
  71. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  72. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  73. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  74. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  75. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  76. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  77. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  78. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  79. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  80. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  81. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  82. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  83. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
  84. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  85. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  86. cuda/cccl/headers/include/cuda/algorithm +1 -1
  87. cuda/cccl/headers/include/cuda/devices +10 -0
  88. cuda/cccl/headers/include/cuda/iterator +1 -0
  89. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  90. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  91. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  92. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  93. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  94. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  95. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  96. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  97. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  98. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  99. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  100. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  101. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  102. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  103. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  104. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  105. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  106. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  107. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  108. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  109. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  110. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  111. cuda/cccl/headers/include/cuda/std/version +1 -4
  112. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  113. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  114. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  115. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  116. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  117. cuda/cccl/parallel/experimental/__init__.py +21 -70
  118. cuda/compute/__init__.py +77 -0
  119. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
  120. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
  121. cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
  122. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  123. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  124. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  125. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
  126. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
  127. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  128. cuda/compute/algorithms/_three_way_partition.py +261 -0
  129. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  130. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  131. cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  132. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  133. cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  134. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  135. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  136. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  137. cuda/coop/__init__.py +8 -0
  138. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  139. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  140. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  141. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  142. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  143. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  144. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  145. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  146. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  147. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  148. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  149. cuda/coop/warp/__init__.py +9 -0
  150. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  151. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  152. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  153. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  154. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
  155. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  156. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  157. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  158. cuda/cccl/parallel/experimental/.gitignore +0 -4
  159. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  160. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  161. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  162. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  163. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  164. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  165. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  166. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  167. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  168. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  169. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  170. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  171. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  172. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  173. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  174. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  175. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  176. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  177. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,77 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
+
5
+ from .algorithms import (
6
+ DoubleBuffer,
7
+ SortOrder,
8
+ binary_transform,
9
+ exclusive_scan,
10
+ histogram_even,
11
+ inclusive_scan,
12
+ make_binary_transform,
13
+ make_exclusive_scan,
14
+ make_histogram_even,
15
+ make_inclusive_scan,
16
+ make_merge_sort,
17
+ make_radix_sort,
18
+ make_reduce_into,
19
+ make_segmented_reduce,
20
+ make_three_way_partition,
21
+ make_unary_transform,
22
+ make_unique_by_key,
23
+ merge_sort,
24
+ radix_sort,
25
+ reduce_into,
26
+ segmented_reduce,
27
+ three_way_partition,
28
+ unary_transform,
29
+ unique_by_key,
30
+ )
31
+ from .iterators import (
32
+ CacheModifiedInputIterator,
33
+ ConstantIterator,
34
+ CountingIterator,
35
+ ReverseIterator,
36
+ TransformIterator,
37
+ TransformOutputIterator,
38
+ ZipIterator,
39
+ )
40
+ from .op import OpKind
41
+ from .struct import gpu_struct
42
+
43
+ __all__ = [
44
+ "binary_transform",
45
+ "CacheModifiedInputIterator",
46
+ "ConstantIterator",
47
+ "CountingIterator",
48
+ "DoubleBuffer",
49
+ "exclusive_scan",
50
+ "gpu_struct",
51
+ "histogram_even",
52
+ "inclusive_scan",
53
+ "make_binary_transform",
54
+ "make_exclusive_scan",
55
+ "make_histogram_even",
56
+ "make_inclusive_scan",
57
+ "make_merge_sort",
58
+ "make_radix_sort",
59
+ "make_reduce_into",
60
+ "make_segmented_reduce",
61
+ "make_three_way_partition",
62
+ "make_unary_transform",
63
+ "make_unique_by_key",
64
+ "merge_sort",
65
+ "OpKind",
66
+ "radix_sort",
67
+ "reduce_into",
68
+ "ReverseIterator",
69
+ "segmented_reduce",
70
+ "SortOrder",
71
+ "TransformIterator",
72
+ "TransformOutputIterator",
73
+ "three_way_partition",
74
+ "unary_transform",
75
+ "unique_by_key",
76
+ "ZipIterator",
77
+ ]
@@ -390,6 +390,7 @@ class DeviceHistogramBuildResult:
390
390
  num_rows: int,
391
391
  row_stride_samples: int,
392
392
  is_evenly_segmented: bool,
393
+ info: CommonData,
393
394
  ): ...
394
395
  def compute_even(
395
396
  self,
@@ -403,3 +404,30 @@ class DeviceHistogramBuildResult:
403
404
  row_stride_samples: int,
404
405
  stream,
405
406
  ) -> None: ...
407
+
408
+ # ---------------------
409
+ # DeviceThreeWayPartition
410
+ # ---------------------
411
+
412
+ class DeviceThreeWayPartitionBuildResult:
413
+ def __init__(
414
+ self,
415
+ d_in: Iterator,
416
+ d_first_part_out: Iterator,
417
+ d_second_part_out: Iterator,
418
+ d_unselected_out: Iterator,
419
+ d_num_selected_out: Iterator,
420
+ select_first_part_op: Op,
421
+ select_second_part_op: Op,
422
+ info: CommonData,
423
+ ): ...
424
+ def compute(
425
+ self,
426
+ d_in: Iterator,
427
+ d_first_part_out: Iterator,
428
+ d_second_part_out: Iterator,
429
+ d_unselected_out: Iterator,
430
+ d_num_selected_out: Iterator,
431
+ num_items: int,
432
+ stream,
433
+ ) -> int: ...
@@ -4,7 +4,7 @@
4
4
 
5
5
  # Python signatures are declared in the companion Python stub file _bindings.pyi
6
6
  # Make sure to update PYI with change to Python API to ensure that Python
7
- # static type checker tools like mypy green-lights cuda.cccl.parallel
7
+ # static type checker tools like mypy green-light cuda.compute
8
8
 
9
9
  from libc.string cimport memset, memcpy
10
10
  from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t
@@ -1982,3 +1982,143 @@ cdef class DeviceHistogramBuildResult:
1982
1982
  <const char*>self.build_data.cubin,
1983
1983
  self.build_data.cubin_size
1984
1984
  )
1985
+
1986
+
1987
+ # ----------------------------------
1988
+ # DeviceThreeWayPartitionBuildResult
1989
+ # ----------------------------------
1990
+ cdef extern from "cccl/c/three_way_partition.h":
1991
+ cdef struct cccl_device_three_way_partition_build_result_t 'cccl_device_three_way_partition_build_result_t':
1992
+ const char* cubin
1993
+ size_t cubin_size
1994
+
1995
+ cdef CUresult cccl_device_three_way_partition_build(
1996
+ cccl_device_three_way_partition_build_result_t *build_ptr,
1997
+ cccl_iterator_t d_in,
1998
+ cccl_iterator_t d_first_part_out,
1999
+ cccl_iterator_t d_second_part_out,
2000
+ cccl_iterator_t d_unselected_out,
2001
+ cccl_iterator_t d_num_selected_out,
2002
+ cccl_op_t select_first_part_op,
2003
+ cccl_op_t select_second_part_op,
2004
+ int, int, const char *, const char *, const char *, const char *
2005
+ ) nogil
2006
+
2007
+ CUresult cccl_device_three_way_partition(
2008
+ cccl_device_three_way_partition_build_result_t build,
2009
+ void* d_temp_storage,
2010
+ size_t* temp_storage_bytes,
2011
+ cccl_iterator_t d_in,
2012
+ cccl_iterator_t d_first_part_out,
2013
+ cccl_iterator_t d_second_part_out,
2014
+ cccl_iterator_t d_unselected_out,
2015
+ cccl_iterator_t d_num_selected_out,
2016
+ cccl_op_t select_first_part_op,
2017
+ cccl_op_t select_second_part_op,
2018
+ int64_t num_items,
2019
+ CUstream stream
2020
+ ) nogil
2021
+
2022
+ cdef CUresult cccl_device_three_way_partition_cleanup(
2023
+ cccl_device_three_way_partition_build_result_t *build_ptr
2024
+ ) nogil
2025
+
2026
+
2027
+ cdef class DeviceThreeWayPartitionBuildResult:
2028
+ cdef cccl_device_three_way_partition_build_result_t build_data
2029
+
2030
+ def __dealloc__(DeviceThreeWayPartitionBuildResult self):
2031
+ cdef CUresult status = -1
2032
+ with nogil:
2033
+ status = cccl_device_three_way_partition_cleanup(&self.build_data)
2034
+ if (status != 0):
2035
+ print(f"Return code {status} encountered during three_way_partition result cleanup")
2036
+
2037
+
2038
+ def __cinit__(
2039
+ DeviceThreeWayPartitionBuildResult self,
2040
+ Iterator d_in,
2041
+ Iterator d_first_part_out,
2042
+ Iterator d_second_part_out,
2043
+ Iterator d_unselected_out,
2044
+ Iterator d_num_selected_out,
2045
+ Op select_first_part_op,
2046
+ Op select_second_part_op,
2047
+ CommonData common_data
2048
+ ):
2049
+ cdef CUresult status = -1
2050
+ cdef int cc_major = common_data.get_cc_major()
2051
+ cdef int cc_minor = common_data.get_cc_minor()
2052
+ cdef const char *cub_path = common_data.cub_path_get_c_str()
2053
+ cdef const char *thrust_path = common_data.thrust_path_get_c_str()
2054
+ cdef const char *libcudacxx_path = common_data.libcudacxx_path_get_c_str()
2055
+ cdef const char *ctk_path = common_data.ctk_path_get_c_str()
2056
+
2057
+ memset(&self.build_data, 0, sizeof(cccl_device_three_way_partition_build_result_t))
2058
+ with nogil:
2059
+ status = cccl_device_three_way_partition_build(
2060
+ &self.build_data,
2061
+ d_in.iter_data,
2062
+ d_first_part_out.iter_data,
2063
+ d_second_part_out.iter_data,
2064
+ d_unselected_out.iter_data,
2065
+ d_num_selected_out.iter_data,
2066
+ select_first_part_op.op_data,
2067
+ select_second_part_op.op_data,
2068
+ cc_major,
2069
+ cc_minor,
2070
+ cub_path,
2071
+ thrust_path,
2072
+ libcudacxx_path,
2073
+ ctk_path,
2074
+ )
2075
+ if status != 0:
2076
+ raise RuntimeError(
2077
+ f"Failed building three_way_partition, error code: {status}"
2078
+ )
2079
+
2080
+ cpdef int compute(
2081
+ DeviceThreeWayPartitionBuildResult self,
2082
+ temp_storage_ptr,
2083
+ temp_storage_bytes,
2084
+ Iterator d_in,
2085
+ Iterator d_first_part_out,
2086
+ Iterator d_second_part_out,
2087
+ Iterator d_unselected_out,
2088
+ Iterator d_num_selected_out,
2089
+ Op select_first_part_op,
2090
+ Op select_second_part_op,
2091
+ size_t num_items,
2092
+ stream
2093
+ ):
2094
+ cdef CUresult status = -1
2095
+ cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
2096
+ cdef size_t storage_sz = <size_t>temp_storage_bytes
2097
+ cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
2098
+
2099
+ with nogil:
2100
+ status = cccl_device_three_way_partition(
2101
+ self.build_data,
2102
+ storage_ptr,
2103
+ &storage_sz,
2104
+ d_in.iter_data,
2105
+ d_first_part_out.iter_data,
2106
+ d_second_part_out.iter_data,
2107
+ d_unselected_out.iter_data,
2108
+ d_num_selected_out.iter_data,
2109
+ select_first_part_op.op_data,
2110
+ select_second_part_op.op_data,
2111
+ <uint64_t>num_items,
2112
+ c_stream
2113
+ )
2114
+ if status != 0:
2115
+ raise RuntimeError(
2116
+ f"Failed executing three_way_partition, error code: {status}"
2117
+ )
2118
+ return storage_sz
2119
+
2120
+ def _get_cubin(self):
2121
+ return PyBytes_FromStringAndSize(
2122
+ <const char*>self.build_data.cubin,
2123
+ self.build_data.cubin_size
2124
+ )
@@ -18,6 +18,8 @@ from ._scan import make_exclusive_scan as make_exclusive_scan
18
18
  from ._scan import make_inclusive_scan as make_inclusive_scan
19
19
  from ._segmented_reduce import make_segmented_reduce as make_segmented_reduce
20
20
  from ._segmented_reduce import segmented_reduce
21
+ from ._three_way_partition import make_three_way_partition as make_three_way_partition
22
+ from ._three_way_partition import three_way_partition as three_way_partition
21
23
  from ._transform import binary_transform, unary_transform
22
24
  from ._transform import make_binary_transform as make_binary_transform
23
25
  from ._transform import make_unary_transform as make_unary_transform
@@ -45,6 +47,8 @@ __all__ = [
45
47
  "make_segmented_reduce",
46
48
  "unique_by_key",
47
49
  "make_unique_by_key",
50
+ "three_way_partition",
51
+ "make_three_way_partition",
48
52
  "DoubleBuffer",
49
53
  "SortOrder",
50
54
  ]
@@ -148,7 +148,7 @@ def make_histogram_even(
148
148
  Example:
149
149
  Below, ``make_histogram_even`` is used to create a histogram object that can be reused.
150
150
 
151
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_object.py
151
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_object.py
152
152
  :language: python
153
153
  :start-after: # example-begin
154
154
 
@@ -190,7 +190,7 @@ def histogram_even(
190
190
  Example:
191
191
  Below, ``histogram_even`` is used to compute a histogram with evenly-spaced bins.
192
192
 
193
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_even_basic.py
193
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_even_basic.py
194
194
  :language: python
195
195
  :start-after: # example-begin
196
196
  :caption: Basic histogram example.
@@ -166,7 +166,7 @@ def make_merge_sort(
166
166
  Example:
167
167
  Below, ``make_merge_sort`` is used to create a merge sort object that can be reused.
168
168
 
169
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_object.py
169
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_object.py
170
170
  :language: python
171
171
  :start-after: # example-begin
172
172
 
@@ -201,7 +201,7 @@ def merge_sort(
201
201
  Example:
202
202
  Below, ``merge_sort`` is used to sort a sequence of keys inplace. It also rearranges the items according to the keys' order.
203
203
 
204
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_basic.py
204
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_basic.py
205
205
  :language: python
206
206
  :start-after: # example-begin
207
207
 
@@ -222,7 +222,7 @@ def make_radix_sort(
222
222
  Example:
223
223
  Below, ``make_radix_sort`` is used to create a radix sort object that can be reused.
224
224
 
225
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_object.py
225
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_object.py
226
226
  :language: python
227
227
  :start-after: # example-begin
228
228
 
@@ -259,14 +259,14 @@ def radix_sort(
259
259
  Example:
260
260
  Below, ``radix_sort`` is used to sort a sequence of keys. It also rearranges the values according to the keys' order.
261
261
 
262
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_basic.py
262
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_basic.py
263
263
  :language: python
264
264
  :start-after: # example-begin
265
265
 
266
266
 
267
267
  In the following example, ``radix_sort`` is used to sort a sequence of keys with a ``DoubleBuffer`` for reduced temporary storage.
268
268
 
269
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_buffer.py
269
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_buffer.py
270
270
  :language: python
271
271
  :start-after: # example-begin
272
272
 
@@ -3,8 +3,6 @@
3
3
  #
4
4
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5
5
 
6
- from __future__ import annotations # TODO: required for Python 3.7 docs env
7
-
8
6
  from typing import Callable, Union
9
7
 
10
8
  import numba
@@ -132,7 +130,7 @@ def make_reduce_into(
132
130
  Example:
133
131
  Below, ``make_reduce_into`` is used to create a reduction object that can be reused.
134
132
 
135
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/reduce_object.py
133
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/reduce_object.py
136
134
  :language: python
137
135
  :start-after: # example-begin
138
136
 
@@ -165,7 +163,7 @@ def reduce_into(
165
163
  Example:
166
164
  Below, ``reduce_into`` is used to compute the sum of a sequence of integers.
167
165
 
168
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/sum_reduction.py
166
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/sum_reduction.py
169
167
  :language: python
170
168
  :start-after: # example-begin
171
169
 
@@ -3,8 +3,6 @@
3
3
  #
4
4
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5
5
 
6
- from __future__ import annotations # TODO: required for Python 3.7 docs env
7
-
8
6
  from typing import Callable, Union
9
7
 
10
8
  import numba
@@ -143,7 +141,7 @@ def make_exclusive_scan(
143
141
  Example:
144
142
  Below, ``make_exclusive_scan`` is used to create an exclusive scan object that can be reused.
145
143
 
146
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_object.py
144
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_object.py
147
145
  :language: python
148
146
  :start-after: # example-begin
149
147
 
@@ -176,7 +174,7 @@ def exclusive_scan(
176
174
  Example:
177
175
  Below, ``exclusive_scan`` is used to compute an exclusive scan with max operation.
178
176
 
179
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_max.py
177
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_max.py
180
178
  :language: python
181
179
  :start-after: # example-begin
182
180
 
@@ -209,7 +207,7 @@ def make_inclusive_scan(
209
207
  Example:
210
208
  Below, ``make_inclusive_scan`` is used to create an inclusive scan object that can be reused.
211
209
 
212
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_object.py
210
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_object.py
213
211
  :language: python
214
212
  :start-after: # example-begin
215
213
 
@@ -242,7 +240,7 @@ def inclusive_scan(
242
240
  Example:
243
241
  Below, ``inclusive_scan`` is used to compute an inclusive scan (prefix sum).
244
242
 
245
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_custom.py
243
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_custom.py
246
244
  :language: python
247
245
  :start-after: # example-begin
248
246
 
@@ -179,7 +179,7 @@ def make_segmented_reduce(
179
179
  Example:
180
180
  Below, ``make_segmented_reduce`` is used to create a segmented reduction object that can be reused.
181
181
 
182
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_object.py
182
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_object.py
183
183
  :language: python
184
184
  :start-after: # example-begin
185
185
 
@@ -216,7 +216,7 @@ def segmented_reduce(
216
216
  Example:
217
217
  Below, ``segmented_reduce`` is used to compute the minimum value of segments in a sequence of integers.
218
218
 
219
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_basic.py
219
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_basic.py
220
220
  :language: python
221
221
  :start-after: # example-begin
222
222