cuda-cccl 0.3.0__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic (further details were linked from the original page).

Files changed (144)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  21. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  22. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  23. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  24. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  25. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  26. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  27. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  30. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  31. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  32. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  33. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  34. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  35. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  39. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  47. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  48. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  49. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  50. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  51. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  52. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  53. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  54. cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
  55. cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
  56. cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
  57. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  58. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  59. cuda/cccl/headers/include/cuda/__event/event.h +1 -0
  60. cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
  61. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  62. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  63. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  64. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  65. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  67. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
  68. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  69. cuda/cccl/headers/include/cuda/algorithm +1 -1
  70. cuda/cccl/headers/include/cuda/devices +10 -0
  71. cuda/cccl/headers/include/cuda/iterator +1 -0
  72. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  73. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  75. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  76. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  77. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  78. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  80. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  81. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  82. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  83. cuda/cccl/headers/include/cuda/std/version +1 -4
  84. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  85. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  86. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  87. cuda/cccl/parallel/experimental/__init__.py +21 -74
  88. cuda/compute/__init__.py +77 -0
  89. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
  90. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  91. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  92. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  93. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  94. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
  95. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  96. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  97. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  98. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  99. cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  100. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  101. cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  102. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  103. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  104. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  105. cuda/coop/__init__.py +8 -0
  106. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  107. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  108. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  109. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  110. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  111. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  112. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  113. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  114. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  115. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  116. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  117. cuda/coop/warp/__init__.py +9 -0
  118. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  119. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  120. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  121. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  122. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
  123. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  125. cuda/cccl/parallel/experimental/.gitignore +0 -4
  126. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  127. /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
  128. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  129. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  130. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  131. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  132. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  133. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  134. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  135. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  136. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  137. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  138. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  139. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  140. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  141. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  142. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  143. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  144. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -3,7 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
4
 
5
5
  """
6
- cuda.cccl.cooperative.block_scan
6
+ cuda.coop.block_scan
7
7
  ===========================
8
8
 
9
9
  This module provides a set of :ref:`collective <collective-primitives>`
@@ -73,16 +73,16 @@ from typing import Any, Callable, Literal
73
73
 
74
74
  import numba
75
75
 
76
- from cuda.cccl.cooperative.experimental._common import (
76
+ from .._common import (
77
77
  CUB_BLOCK_SCAN_ALGOS,
78
78
  make_binary_tempfile,
79
79
  normalize_dim_param,
80
80
  normalize_dtype_param,
81
81
  )
82
- from cuda.cccl.cooperative.experimental._scan_op import (
82
+ from .._scan_op import (
83
83
  ScanOp,
84
84
  )
85
- from cuda.cccl.cooperative.experimental._types import (
85
+ from .._types import (
86
86
  Algorithm,
87
87
  Dependency,
88
88
  DependentArray,
@@ -94,7 +94,7 @@ from cuda.cccl.cooperative.experimental._types import (
94
94
  TemplateParameter,
95
95
  numba_type_to_wrapper,
96
96
  )
97
- from cuda.cccl.cooperative.experimental._typing import (
97
+ from .._typing import (
98
98
  DimType,
99
99
  DtypeType,
100
100
  ScanOpType,
@@ -669,7 +669,7 @@ def exclusive_sum(
669
669
  :ref:`blocked arrangement <flexible-data-arrangement>` across 128
670
670
  threads where each thread owns 4 consecutive items.
671
671
 
672
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_scan_api.py
672
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_scan_api.py
673
673
  :language: python
674
674
  :dedent:
675
675
  :start-after: example-begin imports
@@ -678,7 +678,7 @@ def exclusive_sum(
678
678
  Below is the code snippet that demonstrates the usage of the
679
679
  ``exclusive_sum`` API:
680
680
 
681
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_scan_api.py
681
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_scan_api.py
682
682
  :language: python
683
683
  :dedent:
684
684
  :start-after: example-begin exclusive-sum
@@ -0,0 +1,9 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
+
5
+ from ._warp_merge_sort import merge_sort_keys
6
+ from ._warp_reduce import reduce, sum
7
+ from ._warp_scan import exclusive_sum
8
+
9
+ __all__ = ["exclusive_sum", "reduce", "sum", "merge_sort_keys"]
@@ -4,8 +4,8 @@
4
4
 
5
5
  import numba
6
6
 
7
- from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
8
- from cuda.cccl.cooperative.experimental._types import (
7
+ from .._common import make_binary_tempfile
8
+ from .._types import (
9
9
  Algorithm,
10
10
  Constant,
11
11
  Dependency,
@@ -30,7 +30,7 @@ def merge_sort_keys(
30
30
 
31
31
  Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
32
32
 
33
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py
33
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_merge_sort_api.py
34
34
  :language: python
35
35
  :dedent:
36
36
  :start-after: example-begin merge-sort
@@ -4,8 +4,8 @@
4
4
 
5
5
  import numba
6
6
 
7
- from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
8
- from cuda.cccl.cooperative.experimental._types import (
7
+ from .._common import make_binary_tempfile
8
+ from .._types import (
9
9
  Algorithm,
10
10
  Dependency,
11
11
  DependentPythonOperator,
@@ -28,7 +28,7 @@ def reduce(dtype, binary_op, threads_in_warp=32, methods=None):
28
28
  The code snippet below illustrates a max reduction of 32 integer items that
29
29
  are partitioned across a warp of threads.
30
30
 
31
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
31
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
32
32
  :language: python
33
33
  :dedent:
34
34
  :start-after: example-begin imports
@@ -36,7 +36,7 @@ def reduce(dtype, binary_op, threads_in_warp=32, methods=None):
36
36
 
37
37
  Below is the code snippet that demonstrates the usage of the ``reduce`` API:
38
38
 
39
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
39
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
40
40
  :language: python
41
41
  :dedent:
42
42
  :start-after: example-begin reduce
@@ -100,7 +100,7 @@ def sum(dtype, threads_in_warp=32):
100
100
  The code snippet below illustrates a reduction of 32 integer items that
101
101
  are partitioned across a warp of threads.
102
102
 
103
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
103
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
104
104
  :language: python
105
105
  :dedent:
106
106
  :start-after: example-begin imports
@@ -108,7 +108,7 @@ def sum(dtype, threads_in_warp=32):
108
108
 
109
109
  Below is the code snippet that demonstrates the usage of the ``reduce`` API:
110
110
 
111
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
111
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
112
112
  :language: python
113
113
  :dedent:
114
114
  :start-after: example-begin sum
@@ -5,8 +5,8 @@
5
5
 
6
6
  import numba
7
7
 
8
- from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
9
- from cuda.cccl.cooperative.experimental._types import (
8
+ from .._common import make_binary_tempfile
9
+ from .._types import (
10
10
  Algorithm,
11
11
  Dependency,
12
12
  DependentReference,
@@ -23,7 +23,7 @@ def exclusive_sum(dtype, threads_in_warp=32):
23
23
  Example:
24
24
  The code snippet below illustrates an exclusive prefix sum of 32 integer items:
25
25
 
26
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_scan_api.py
26
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_scan_api.py
27
27
  :language: python
28
28
  :dedent:
29
29
  :start-after: example-begin imports
@@ -31,7 +31,7 @@ def exclusive_sum(dtype, threads_in_warp=32):
31
31
 
32
32
  Below is the code snippet that demonstrates the usage of the ``exclusive_sum`` API:
33
33
 
34
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_scan_api.py
34
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_scan_api.py
35
35
  :language: python
36
36
  :dedent:
37
37
  :start-after: example-begin exclusive-sum
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cuda-cccl
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: CUDA Core Library for Python
5
5
  Author: NVIDIA Corporation
6
6
  Classifier: Programming Language :: Python :: 3 :: Only