cuda-cccl 0.1.3.2.0.dev438__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (177) hide show
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  21. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  22. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  23. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  24. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  25. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  26. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  27. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  30. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  31. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  32. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  33. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  34. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
  35. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  39. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  49. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
  52. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  55. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  56. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  57. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  58. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  59. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  60. cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
  61. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  62. cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
  63. cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
  64. cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
  65. cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
  66. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  67. cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
  68. cuda/cccl/headers/include/cuda/__event/event.h +8 -8
  69. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  70. cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
  71. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  72. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  73. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  74. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  75. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  76. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  77. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  78. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  79. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  80. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  81. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  82. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  83. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
  84. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  85. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  86. cuda/cccl/headers/include/cuda/algorithm +1 -1
  87. cuda/cccl/headers/include/cuda/devices +10 -0
  88. cuda/cccl/headers/include/cuda/iterator +1 -0
  89. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  90. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  91. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  92. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  93. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  94. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  95. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  96. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  97. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  98. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  99. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  100. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  101. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  102. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  103. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  104. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  105. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  106. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  107. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  108. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  109. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  110. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  111. cuda/cccl/headers/include/cuda/std/version +1 -4
  112. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  113. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  114. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  115. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  116. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  117. cuda/cccl/parallel/experimental/__init__.py +21 -70
  118. cuda/compute/__init__.py +77 -0
  119. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
  120. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
  121. cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
  122. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  123. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  124. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  125. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
  126. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
  127. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  128. cuda/compute/algorithms/_three_way_partition.py +261 -0
  129. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  130. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  131. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  132. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  133. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  134. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  135. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  136. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  137. cuda/coop/__init__.py +8 -0
  138. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  139. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  140. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  141. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  142. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  143. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  144. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  145. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  146. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  147. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  148. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  149. cuda/coop/warp/__init__.py +9 -0
  150. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  151. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  152. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  153. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  154. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
  155. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  156. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  157. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  158. cuda/cccl/parallel/experimental/.gitignore +0 -4
  159. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  160. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  161. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  162. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  163. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  164. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  165. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  166. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  167. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  168. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  169. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  170. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  171. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  172. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  173. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  174. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  175. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  176. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  177. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Tuple, Union
6
6
 
7
7
  import numba
8
8
 
9
- from cuda.cccl.cooperative.experimental._common import (
9
+ from .._common import (
10
10
  CUB_BLOCK_SCAN_ALGOS,
11
11
  CudaSharedMemConfig,
12
12
  dim3,
@@ -14,7 +14,7 @@ from cuda.cccl.cooperative.experimental._common import (
14
14
  normalize_dim_param,
15
15
  normalize_dtype_param,
16
16
  )
17
- from cuda.cccl.cooperative.experimental._types import (
17
+ from .._types import (
18
18
  Algorithm,
19
19
  Dependency,
20
20
  DependentArray,
@@ -140,7 +140,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
140
140
  are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
141
141
  where each thread owns 4 consecutive keys. We start by importing necessary modules:
142
142
 
143
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
143
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
144
144
  :language: python
145
145
  :dedent:
146
146
  :start-after: example-begin imports
@@ -148,7 +148,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
148
148
 
149
149
  Below is the code snippet that demonstrates the usage of the ``radix_sort_keys`` API:
150
150
 
151
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
151
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
152
152
  :language: python
153
153
  :dedent:
154
154
  :start-after: example-begin radix-sort
@@ -181,7 +181,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
181
181
  are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
182
182
  where each thread owns 4 consecutive keys. We start by importing necessary modules:
183
183
 
184
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
184
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
185
185
  :language: python
186
186
  :dedent:
187
187
  :start-after: example-begin imports
@@ -189,7 +189,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
189
189
 
190
190
  Below is the code snippet that demonstrates the usage of the ``radix_sort_keys`` API:
191
191
 
192
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
192
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
193
193
  :language: python
194
194
  :dedent:
195
195
  :start-after: example-begin radix-sort-descending
@@ -6,13 +6,13 @@ from typing import TYPE_CHECKING, Callable, Literal, Tuple, Union
6
6
 
7
7
  import numba
8
8
 
9
- from cuda.cccl.cooperative.experimental._common import (
9
+ from .._common import (
10
10
  CUB_BLOCK_REDUCE_ALGOS,
11
11
  make_binary_tempfile,
12
12
  normalize_dim_param,
13
13
  normalize_dtype_param,
14
14
  )
15
- from cuda.cccl.cooperative.experimental._types import (
15
+ from .._types import (
16
16
  Algorithm,
17
17
  Dependency,
18
18
  DependentArray,
@@ -208,13 +208,13 @@ def reduce(
208
208
  The code snippet below illustrates a max reduction of 128 integer items that are
209
209
  partitioned across 128 threads.
210
210
 
211
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
211
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
212
212
  :language: python
213
213
  :dedent:
214
214
  :start-after: example-begin imports
215
215
  :end-before: example-end imports
216
216
 
217
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
217
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
218
218
  :language: python
219
219
  :dedent:
220
220
  :start-after: example-begin reduce
@@ -269,13 +269,13 @@ def sum(
269
269
  The code snippet below illustrates a sum of 128 integer items that are partitioned
270
270
  across 128 threads.
271
271
 
272
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
272
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
273
273
  :language: python
274
274
  :dedent:
275
275
  :start-after: example-begin imports
276
276
  :end-before: example-end imports
277
277
 
278
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
278
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
279
279
  :language: python
280
280
  :dedent:
281
281
  :start-after: example-begin sum
@@ -3,7 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
4
 
5
5
  """
6
- cuda.cccl.cooperative.block_scan
6
+ cuda.coop.block_scan
7
7
  ===========================
8
8
 
9
9
  This module provides a set of :ref:`collective <collective-primitives>`
@@ -73,16 +73,16 @@ from typing import Any, Callable, Literal
73
73
 
74
74
  import numba
75
75
 
76
- from cuda.cccl.cooperative.experimental._common import (
76
+ from .._common import (
77
77
  CUB_BLOCK_SCAN_ALGOS,
78
78
  make_binary_tempfile,
79
79
  normalize_dim_param,
80
80
  normalize_dtype_param,
81
81
  )
82
- from cuda.cccl.cooperative.experimental._scan_op import (
82
+ from .._scan_op import (
83
83
  ScanOp,
84
84
  )
85
- from cuda.cccl.cooperative.experimental._types import (
85
+ from .._types import (
86
86
  Algorithm,
87
87
  Dependency,
88
88
  DependentArray,
@@ -94,7 +94,7 @@ from cuda.cccl.cooperative.experimental._types import (
94
94
  TemplateParameter,
95
95
  numba_type_to_wrapper,
96
96
  )
97
- from cuda.cccl.cooperative.experimental._typing import (
97
+ from .._typing import (
98
98
  DimType,
99
99
  DtypeType,
100
100
  ScanOpType,
@@ -669,7 +669,7 @@ def exclusive_sum(
669
669
  :ref:`blocked arrangement <flexible-data-arrangement>` across 128
670
670
  threads where each thread owns 4 consecutive items.
671
671
 
672
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_scan_api.py
672
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_scan_api.py
673
673
  :language: python
674
674
  :dedent:
675
675
  :start-after: example-begin imports
@@ -678,7 +678,7 @@ def exclusive_sum(
678
678
  Below is the code snippet that demonstrates the usage of the
679
679
  ``exclusive_sum`` API:
680
680
 
681
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_scan_api.py
681
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_scan_api.py
682
682
  :language: python
683
683
  :dedent:
684
684
  :start-after: example-begin exclusive-sum
@@ -0,0 +1,9 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
+
5
+ from ._warp_merge_sort import merge_sort_keys
6
+ from ._warp_reduce import reduce, sum
7
+ from ._warp_scan import exclusive_sum
8
+
9
+ __all__ = ["exclusive_sum", "reduce", "sum", "merge_sort_keys"]
@@ -4,8 +4,8 @@
4
4
 
5
5
  import numba
6
6
 
7
- from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
8
- from cuda.cccl.cooperative.experimental._types import (
7
+ from .._common import make_binary_tempfile
8
+ from .._types import (
9
9
  Algorithm,
10
10
  Constant,
11
11
  Dependency,
@@ -30,7 +30,7 @@ def merge_sort_keys(
30
30
 
31
31
  Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
32
32
 
33
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py
33
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_merge_sort_api.py
34
34
  :language: python
35
35
  :dedent:
36
36
  :start-after: example-begin merge-sort
@@ -4,8 +4,8 @@
4
4
 
5
5
  import numba
6
6
 
7
- from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
8
- from cuda.cccl.cooperative.experimental._types import (
7
+ from .._common import make_binary_tempfile
8
+ from .._types import (
9
9
  Algorithm,
10
10
  Dependency,
11
11
  DependentPythonOperator,
@@ -28,7 +28,7 @@ def reduce(dtype, binary_op, threads_in_warp=32, methods=None):
28
28
  The code snippet below illustrates a max reduction of 32 integer items that
29
29
  are partitioned across a warp of threads.
30
30
 
31
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
31
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
32
32
  :language: python
33
33
  :dedent:
34
34
  :start-after: example-begin imports
@@ -36,7 +36,7 @@ def reduce(dtype, binary_op, threads_in_warp=32, methods=None):
36
36
 
37
37
  Below is the code snippet that demonstrates the usage of the ``reduce`` API:
38
38
 
39
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
39
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
40
40
  :language: python
41
41
  :dedent:
42
42
  :start-after: example-begin reduce
@@ -100,7 +100,7 @@ def sum(dtype, threads_in_warp=32):
100
100
  The code snippet below illustrates a reduction of 32 integer items that
101
101
  are partitioned across a warp of threads.
102
102
 
103
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
103
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
104
104
  :language: python
105
105
  :dedent:
106
106
  :start-after: example-begin imports
@@ -108,7 +108,7 @@ def sum(dtype, threads_in_warp=32):
108
108
 
109
109
  Below is the code snippet that demonstrates the usage of the ``reduce`` API:
110
110
 
111
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
111
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
112
112
  :language: python
113
113
  :dedent:
114
114
  :start-after: example-begin sum
@@ -5,8 +5,8 @@
5
5
 
6
6
  import numba
7
7
 
8
- from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
9
- from cuda.cccl.cooperative.experimental._types import (
8
+ from .._common import make_binary_tempfile
9
+ from .._types import (
10
10
  Algorithm,
11
11
  Dependency,
12
12
  DependentReference,
@@ -23,7 +23,7 @@ def exclusive_sum(dtype, threads_in_warp=32):
23
23
  Example:
24
24
  The code snippet below illustrates an exclusive prefix sum of 32 integer items:
25
25
 
26
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_scan_api.py
26
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_scan_api.py
27
27
  :language: python
28
28
  :dedent:
29
29
  :start-after: example-begin imports
@@ -31,7 +31,7 @@ def exclusive_sum(dtype, threads_in_warp=32):
31
31
 
32
32
  Below is the code snippet that demonstrates the usage of the ``exclusive_sum`` API:
33
33
 
34
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_scan_api.py
34
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_scan_api.py
35
35
  :language: python
36
36
  :dedent:
37
37
  :start-after: example-begin exclusive-sum
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cuda-cccl
3
- Version: 0.1.3.2.0.dev438
3
+ Version: 0.3.1
4
4
  Summary: CUDA Core Library for Python
5
5
  Author: NVIDIA Corporation
6
6
  Classifier: Programming Language :: Python :: 3 :: Only