cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (144) hide show
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  21. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  22. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  23. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  24. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  25. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  26. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  27. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  30. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  31. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  32. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  33. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  34. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  35. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  39. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  47. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  48. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  49. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  50. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  51. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  52. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  53. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  54. cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
  55. cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
  56. cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
  57. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  58. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  59. cuda/cccl/headers/include/cuda/__event/event.h +1 -0
  60. cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
  61. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  62. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  63. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  64. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  65. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  67. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
  68. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  69. cuda/cccl/headers/include/cuda/algorithm +1 -1
  70. cuda/cccl/headers/include/cuda/devices +10 -0
  71. cuda/cccl/headers/include/cuda/iterator +1 -0
  72. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  73. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  75. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  76. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  77. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  78. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  80. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  81. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  82. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  83. cuda/cccl/headers/include/cuda/std/version +1 -4
  84. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  85. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  86. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  87. cuda/cccl/parallel/experimental/__init__.py +21 -74
  88. cuda/compute/__init__.py +77 -0
  89. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
  90. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  91. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  92. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  93. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  94. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
  95. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  96. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  97. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  98. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  99. cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  100. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  101. cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  102. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  103. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  104. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  105. cuda/coop/__init__.py +8 -0
  106. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  107. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  108. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  109. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  110. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  111. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  112. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  113. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  114. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  115. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  116. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  117. cuda/coop/warp/__init__.py +9 -0
  118. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  119. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  120. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  121. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  122. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
  123. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  125. cuda/cccl/parallel/experimental/.gitignore +0 -4
  126. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  127. /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
  128. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  129. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  130. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  131. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  132. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  133. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  134. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  135. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  136. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  137. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  138. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  139. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  140. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  141. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  142. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  143. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  144. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -166,7 +166,7 @@ def make_merge_sort(
166
166
  Example:
167
167
  Below, ``make_merge_sort`` is used to create a merge sort object that can be reused.
168
168
 
169
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_object.py
169
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_object.py
170
170
  :language: python
171
171
  :start-after: # example-begin
172
172
 
@@ -201,7 +201,7 @@ def merge_sort(
201
201
  Example:
202
202
  Below, ``merge_sort`` is used to sort a sequence of keys inplace. It also rearranges the items according to the keys' order.
203
203
 
204
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_basic.py
204
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_basic.py
205
205
  :language: python
206
206
  :start-after: # example-begin
207
207
 
@@ -222,7 +222,7 @@ def make_radix_sort(
222
222
  Example:
223
223
  Below, ``make_radix_sort`` is used to create a radix sort object that can be reused.
224
224
 
225
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_object.py
225
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_object.py
226
226
  :language: python
227
227
  :start-after: # example-begin
228
228
 
@@ -259,14 +259,14 @@ def radix_sort(
259
259
  Example:
260
260
  Below, ``radix_sort`` is used to sort a sequence of keys. It also rearranges the values according to the keys' order.
261
261
 
262
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_basic.py
262
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_basic.py
263
263
  :language: python
264
264
  :start-after: # example-begin
265
265
 
266
266
 
267
267
  In the following example, ``radix_sort`` is used to sort a sequence of keys with a ``DoubleBuffer`` for reduced temporary storage.
268
268
 
269
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_buffer.py
269
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_buffer.py
270
270
  :language: python
271
271
  :start-after: # example-begin
272
272
 
@@ -130,7 +130,7 @@ def make_reduce_into(
130
130
  Example:
131
131
  Below, ``make_reduce_into`` is used to create a reduction object that can be reused.
132
132
 
133
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/reduce_object.py
133
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/reduce_object.py
134
134
  :language: python
135
135
  :start-after: # example-begin
136
136
 
@@ -163,7 +163,7 @@ def reduce_into(
163
163
  Example:
164
164
  Below, ``reduce_into`` is used to compute the sum of a sequence of integers.
165
165
 
166
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/sum_reduction.py
166
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/sum_reduction.py
167
167
  :language: python
168
168
  :start-after: # example-begin
169
169
 
@@ -141,7 +141,7 @@ def make_exclusive_scan(
141
141
  Example:
142
142
  Below, ``make_exclusive_scan`` is used to create an exclusive scan object that can be reused.
143
143
 
144
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_object.py
144
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_object.py
145
145
  :language: python
146
146
  :start-after: # example-begin
147
147
 
@@ -174,7 +174,7 @@ def exclusive_scan(
174
174
  Example:
175
175
  Below, ``exclusive_scan`` is used to compute an exclusive scan with max operation.
176
176
 
177
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_max.py
177
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_max.py
178
178
  :language: python
179
179
  :start-after: # example-begin
180
180
 
@@ -207,7 +207,7 @@ def make_inclusive_scan(
207
207
  Example:
208
208
  Below, ``make_inclusive_scan`` is used to create an inclusive scan object that can be reused.
209
209
 
210
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_object.py
210
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_object.py
211
211
  :language: python
212
212
  :start-after: # example-begin
213
213
 
@@ -240,7 +240,7 @@ def inclusive_scan(
240
240
  Example:
241
241
  Below, ``inclusive_scan`` is used to compute an inclusive scan (prefix sum).
242
242
 
243
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_custom.py
243
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_custom.py
244
244
  :language: python
245
245
  :start-after: # example-begin
246
246
 
@@ -179,7 +179,7 @@ def make_segmented_reduce(
179
179
  Example:
180
180
  Below, ``make_segmented_reduce`` is used to create a segmented reduction object that can be reused.
181
181
 
182
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_object.py
182
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_object.py
183
183
  :language: python
184
184
  :start-after: # example-begin
185
185
 
@@ -216,7 +216,7 @@ def segmented_reduce(
216
216
  Example:
217
217
  Below, ``segmented_reduce`` is used to compute the minimum value of segments in a sequence of integers.
218
218
 
219
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_basic.py
219
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_basic.py
220
220
  :language: python
221
221
  :start-after: # example-begin
222
222
 
@@ -165,7 +165,7 @@ def make_three_way_partition(
165
165
  Example:
166
166
  Below, ``make_three_way_partition`` is used to create a three-way partition object that can be reused.
167
167
 
168
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_object.py
168
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_object.py
169
169
  :language: python
170
170
  :start-after: # example-begin
171
171
 
@@ -214,7 +214,7 @@ def three_way_partition(
214
214
  Example:
215
215
  Below, ``three_way_partition`` is used to partition a sequence of integers into three parts.
216
216
 
217
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_basic.py
217
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_basic.py
218
218
  :language: python
219
219
  :start-after: # example-begin
220
220
 
@@ -196,7 +196,7 @@ def make_unary_transform(
196
196
  storage allocation. For simpler usage, consider using :func:`unary_transform`.
197
197
 
198
198
  Example:
199
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_object.py
199
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_object.py
200
200
  :language: python
201
201
  :start-after: # example-begin
202
202
 
@@ -227,7 +227,7 @@ def make_binary_transform(
227
227
  storage allocation. For simpler usage, consider using :func:`binary_transform`.
228
228
 
229
229
  Example:
230
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_object.py
230
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_object.py
231
231
  :language: python
232
232
  :start-after: # example-begin
233
233
 
@@ -259,7 +259,7 @@ def unary_transform(
259
259
  Example:
260
260
  Below, ``unary_transform`` is used to apply a transformation to each element of the input.
261
261
 
262
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_basic.py
262
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_basic.py
263
263
  :language: python
264
264
  :start-after: # example-begin
265
265
 
@@ -291,7 +291,7 @@ def binary_transform(
291
291
  Example:
292
292
  Below, ``binary_transform`` is used to apply a transformation to pairs of elements from two input sequences.
293
293
 
294
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_basic.py
294
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_basic.py
295
295
  :language: python
296
296
  :start-after: # example-begin
297
297
 
@@ -171,7 +171,7 @@ def make_unique_by_key(
171
171
  Example:
172
172
  Below, ``make_unique_by_key`` is used to create a unique by key object that can be reused.
173
173
 
174
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_object.py
174
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_object.py
175
175
  :language: python
176
176
  :start-after: # example-begin
177
177
 
@@ -211,7 +211,7 @@ def unique_by_key(
211
211
  Example:
212
212
  Below, ``unique_by_key`` is used to populate the arrays of output keys and items with the first key and its corresponding item from each sequence of equal keys. It also outputs the number of items selected.
213
213
 
214
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_basic.py
214
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_basic.py
215
215
  :language: python
216
216
  :start-after: # example-begin
217
217
 
@@ -26,7 +26,7 @@ def CacheModifiedInputIterator(device_array, modifier):
26
26
  Example:
27
27
  The code snippet below demonstrates the usage of a ``CacheModifiedInputIterator``:
28
28
 
29
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/cache_modified_iterator_basic.py
29
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/cache_modified_iterator_basic.py
30
30
  :language: python
31
31
  :start-after: # example-begin
32
32
 
@@ -55,7 +55,7 @@ def ConstantIterator(value):
55
55
  The code snippet below demonstrates the usage of a ``ConstantIterator``
56
56
  representing a sequence of constant values:
57
57
 
58
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/constant_iterator_basic.py
58
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/constant_iterator_basic.py
59
59
  :language: python
60
60
  :start-after: # example-begin
61
61
 
@@ -78,7 +78,7 @@ def CountingIterator(offset):
78
78
  The code snippet below demonstrates the usage of a ``CountingIterator``
79
79
  representing the sequence ``[10, 11, 12]``:
80
80
 
81
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/counting_iterator_basic.py
81
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/counting_iterator_basic.py
82
82
  :language: python
83
83
  :start-after: # example-begin
84
84
 
@@ -100,13 +100,13 @@ def ReverseIterator(sequence):
100
100
  Examples:
101
101
  The code snippet below demonstrates the usage of a ``ReverseIterator`` as an input iterator:
102
102
 
103
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_input_iterator.py
103
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_input_iterator.py
104
104
  :language: python
105
105
  :start-after: # example-begin
106
106
 
107
107
  The code snippet below demonstrates the usage of a ``ReverseIterator`` as an output iterator:
108
108
 
109
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_output_iterator.py
109
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_output_iterator.py
110
110
  :language: python
111
111
  :start-after: # example-begin
112
112
 
@@ -129,7 +129,7 @@ def TransformIterator(it, op):
129
129
  The code snippet below demonstrates the usage of a ``TransformIterator`` composed with a ``CountingIterator``
130
130
  to transform the input before performing a reduction.
131
131
 
132
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_iterator_basic.py
132
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_iterator_basic.py
133
133
  :language: python
134
134
  :start-after: # example-begin
135
135
  Args:
@@ -151,7 +151,7 @@ def TransformOutputIterator(it, op):
151
151
  The code snippet below demonstrates the usage of a ``TransformOutputIterator`` to transform the output
152
152
  of a reduction before writing to an output array.
153
153
 
154
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_output_iterator.py
154
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_output_iterator.py
155
155
  :language: python
156
156
  :start-after: # example-begin
157
157
 
@@ -178,7 +178,7 @@ def ZipIterator(*iterators):
178
178
  The code snippet below demonstrates the usage of a ``ZipIterator``
179
179
  combining two device arrays:
180
180
 
181
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/zip_iterator_elementwise.py
181
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/zip_iterator_elementwise.py
182
182
  :language: python
183
183
  :start-after: # example-begin
184
184
 
@@ -207,7 +207,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
207
207
  to a dataclass). The type of each field must be a subclass of
208
208
  `np.number`, like `np.int32` or `np.float64`.
209
209
 
210
- Arrays of GPUStruct objects can be used as inputs to cuda.cccl.parallel
210
+ Arrays of GPUStruct objects can be used as inputs to cuda.compute
211
211
  algorithms.
212
212
 
213
213
  Example:
@@ -216,7 +216,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
216
216
  a reduction on an input array of floating point values to compute its
217
217
  smallest and largest absolute values:
218
218
 
219
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/minmax_reduction.py
219
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/minmax_reduction.py
220
220
  :language: python
221
221
  :start-after: # example-begin
222
222
 
cuda/coop/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
+
5
+ from . import block, warp
6
+ from ._types import StatefulFunction
7
+
8
+ __all__ = ["block", "warp", "StatefulFunction"]
@@ -5,8 +5,9 @@
5
5
  import functools
6
6
 
7
7
  from cuda.bindings import nvrtc
8
- from cuda.cccl.cooperative.experimental._caching import disk_cache
9
- from cuda.cccl.cooperative.experimental._common import check_in, version
8
+
9
+ from ._caching import disk_cache
10
+ from ._common import check_in, version
10
11
 
11
12
 
12
13
  def CHECK_NVRTC(err, prog):
@@ -3,8 +3,8 @@
3
3
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
4
 
5
5
  """
6
- cuda.cccl.cooperative.experimental._scan_op
7
- ======================================
6
+ cuda.coop._scan_op
7
+ ==================
8
8
 
9
9
  This module implements the ``ScanOp`` class and related functions.
10
10
  """
@@ -14,7 +14,7 @@ from enum import Enum
14
14
 
15
15
  import numpy as np
16
16
 
17
- from cuda.cccl.cooperative.experimental._typing import (
17
+ from ._typing import (
18
18
  ScanOpType,
19
19
  )
20
20
 
@@ -17,8 +17,8 @@ from numba.core.typing import signature
17
17
  from numba.cuda import LTOIR
18
18
  from numba.cuda.cudadrv import driver as cuda_driver
19
19
 
20
- import cuda.cccl.cooperative.experimental._nvrtc as nvrtc
21
- from cuda.cccl.cooperative.experimental._common import find_unsigned
20
+ from . import _nvrtc as nvrtc
21
+ from ._common import find_unsigned
22
22
 
23
23
  NUMBA_TYPES_TO_CPP = {
24
24
  types.boolean: "bool",
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
9
9
  import numba
10
10
  import numpy as np
11
11
 
12
- from cuda.cccl.cooperative.experimental._common import dim3
12
+ from ._common import dim3
13
13
 
14
14
  # Type alias for dimension parameters that can be passed to CUDA functions.
15
15
  DimType = Union["dim3", int, Tuple[int, int], Tuple[int, int, int]]
@@ -2,18 +2,18 @@
2
2
  #
3
3
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
4
 
5
- from cuda.cccl.cooperative.experimental.block._block_exchange import (
5
+ from ._block_exchange import (
6
6
  BlockExchangeType,
7
7
  exchange,
8
8
  )
9
- from cuda.cccl.cooperative.experimental.block._block_load_store import load, store
10
- from cuda.cccl.cooperative.experimental.block._block_merge_sort import merge_sort_keys
11
- from cuda.cccl.cooperative.experimental.block._block_radix_sort import (
9
+ from ._block_load_store import load, store
10
+ from ._block_merge_sort import merge_sort_keys
11
+ from ._block_radix_sort import (
12
12
  radix_sort_keys,
13
13
  radix_sort_keys_descending,
14
14
  )
15
- from cuda.cccl.cooperative.experimental.block._block_reduce import reduce, sum
16
- from cuda.cccl.cooperative.experimental.block._block_scan import (
15
+ from ._block_reduce import reduce, sum
16
+ from ._block_scan import (
17
17
  exclusive_scan,
18
18
  exclusive_sum,
19
19
  inclusive_scan,
@@ -3,7 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
4
 
5
5
  """
6
- cuda.cccl.cooperative.block_exchange
6
+ cuda.coop.block_exchange
7
7
  ====================================
8
8
 
9
9
  This module provides a set of :ref:`collective <collective-primitives>` methods
@@ -105,13 +105,13 @@ def exchange(
105
105
  perform. Currently, only :py:attr:`StripedToBlocked` is supported.
106
106
 
107
107
  :param dtype: Supplies the data type of the input and output arrays.
108
- :type dtype: :py:class:`cuda.cccl.cooperative.experimental._typing.DtypeType`
108
+ :type dtype: :py:class:`cuda.coop._typing.DtypeType`
109
109
 
110
110
  :param threads_per_block: Supplies the number of threads in the block,
111
111
  either as an integer for a 1D block or a tuple of two or three integers
112
112
  for a 2D or 3D block, respectively.
113
113
  :type threads_per_block:
114
- :py:class:`cuda.cccl.cooperative.experimental._typing.DimType`
114
+ :py:class:`cuda.coop._typing.DimType`
115
115
 
116
116
  :param items_per_thread: Supplies the number of items partitioned onto each
117
117
  thread.
@@ -137,7 +137,7 @@ def exchange(
137
137
  :raises ValueError: If ``items_per_thread`` is greater than 1 and
138
138
  ``methods`` is not *None* (i.e. a user-defined type is being used).
139
139
 
140
- :returns: An :py:class:`cuda.cccl.cooperative.experimental._types.Invocable`
140
+ :returns: An :py:class:`cuda.coop._types.Invocable`
141
141
  object representing the specialized kernel that can be called from
142
142
  a Numba JIT'd CUDA kernel.
143
143
 
@@ -5,12 +5,12 @@
5
5
 
6
6
  import numba
7
7
 
8
- from cuda.cccl.cooperative.experimental._common import (
8
+ from .._common import (
9
9
  make_binary_tempfile,
10
10
  normalize_dim_param,
11
11
  normalize_dtype_param,
12
12
  )
13
- from cuda.cccl.cooperative.experimental._types import (
13
+ from .._types import (
14
14
  Algorithm,
15
15
  Dependency,
16
16
  DependentArray,
@@ -70,13 +70,13 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
70
70
  The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
71
71
  each thread handling 4 integers.
72
72
 
73
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
73
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
74
74
  :language: python
75
75
  :dedent:
76
76
  :start-after: example-begin imports
77
77
  :end-before: example-end imports
78
78
 
79
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
79
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
80
80
  :language: python
81
81
  :dedent:
82
82
  :start-after: example-begin load_store
@@ -158,13 +158,13 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
158
158
  The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
159
159
  each thread handling 4 integers.
160
160
 
161
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
161
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
162
162
  :language: python
163
163
  :dedent:
164
164
  :start-after: example-begin imports
165
165
  :end-before: example-end imports
166
166
 
167
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
167
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
168
168
  :language: python
169
169
  :dedent:
170
170
  :start-after: example-begin load_store
@@ -6,12 +6,12 @@ from typing import TYPE_CHECKING, Callable, Literal, Union
6
6
 
7
7
  import numba
8
8
 
9
- from cuda.cccl.cooperative.experimental._common import (
9
+ from .._common import (
10
10
  make_binary_tempfile,
11
11
  normalize_dim_param,
12
12
  normalize_dtype_param,
13
13
  )
14
- from cuda.cccl.cooperative.experimental._types import (
14
+ from .._types import (
15
15
  Algorithm,
16
16
  Constant,
17
17
  Dependency,
@@ -41,7 +41,7 @@ def merge_sort_keys(
41
41
  are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
42
42
  where each thread owns 4 consecutive keys. We start by importing necessary modules:
43
43
 
44
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py
44
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
45
45
  :language: python
46
46
  :dedent:
47
47
  :start-after: example-begin imports
@@ -49,7 +49,7 @@ def merge_sort_keys(
49
49
 
50
50
  Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
51
51
 
52
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py
52
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
53
53
  :language: python
54
54
  :dedent:
55
55
  :start-after: example-begin merge-sort
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Tuple, Union
6
6
 
7
7
  import numba
8
8
 
9
- from cuda.cccl.cooperative.experimental._common import (
9
+ from .._common import (
10
10
  CUB_BLOCK_SCAN_ALGOS,
11
11
  CudaSharedMemConfig,
12
12
  dim3,
@@ -14,7 +14,7 @@ from cuda.cccl.cooperative.experimental._common import (
14
14
  normalize_dim_param,
15
15
  normalize_dtype_param,
16
16
  )
17
- from cuda.cccl.cooperative.experimental._types import (
17
+ from .._types import (
18
18
  Algorithm,
19
19
  Dependency,
20
20
  DependentArray,
@@ -140,7 +140,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
140
140
  are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
141
141
  where each thread owns 4 consecutive keys. We start by importing necessary modules:
142
142
 
143
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
143
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
144
144
  :language: python
145
145
  :dedent:
146
146
  :start-after: example-begin imports
@@ -148,7 +148,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
148
148
 
149
149
  Below is the code snippet that demonstrates the usage of the ``radix_sort_keys`` API:
150
150
 
151
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
151
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
152
152
  :language: python
153
153
  :dedent:
154
154
  :start-after: example-begin radix-sort
@@ -181,7 +181,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
181
181
  are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
182
182
  where each thread owns 4 consecutive keys. We start by importing necessary modules:
183
183
 
184
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
184
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
185
185
  :language: python
186
186
  :dedent:
187
187
  :start-after: example-begin imports
@@ -189,7 +189,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
189
189
 
190
190
  Below is the code snippet that demonstrates the usage of the ``radix_sort_keys_descending`` API:
191
191
 
192
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
192
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
193
193
  :language: python
194
194
  :dedent:
195
195
  :start-after: example-begin radix-sort-descending
@@ -6,13 +6,13 @@ from typing import TYPE_CHECKING, Callable, Literal, Tuple, Union
6
6
 
7
7
  import numba
8
8
 
9
- from cuda.cccl.cooperative.experimental._common import (
9
+ from .._common import (
10
10
  CUB_BLOCK_REDUCE_ALGOS,
11
11
  make_binary_tempfile,
12
12
  normalize_dim_param,
13
13
  normalize_dtype_param,
14
14
  )
15
- from cuda.cccl.cooperative.experimental._types import (
15
+ from .._types import (
16
16
  Algorithm,
17
17
  Dependency,
18
18
  DependentArray,
@@ -208,13 +208,13 @@ def reduce(
208
208
  The code snippet below illustrates a max reduction of 128 integer items that are
209
209
  partitioned across 128 threads.
210
210
 
211
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
211
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
212
212
  :language: python
213
213
  :dedent:
214
214
  :start-after: example-begin imports
215
215
  :end-before: example-end imports
216
216
 
217
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
217
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
218
218
  :language: python
219
219
  :dedent:
220
220
  :start-after: example-begin reduce
@@ -269,13 +269,13 @@ def sum(
269
269
  The code snippet below illustrates a sum of 128 integer items that are partitioned
270
270
  across 128 threads.
271
271
 
272
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
272
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
273
273
  :language: python
274
274
  :dedent:
275
275
  :start-after: example-begin imports
276
276
  :end-before: example-end imports
277
277
 
278
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
278
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
279
279
  :language: python
280
280
  :dedent:
281
281
  :start-after: example-begin sum