cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (144)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  21. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  22. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  23. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  24. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  25. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  26. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  27. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  30. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  31. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  32. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  33. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  34. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  35. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  39. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  47. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  48. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  49. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  50. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  51. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  52. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  53. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  54. cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
  55. cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
  56. cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
  57. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  58. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  59. cuda/cccl/headers/include/cuda/__event/event.h +1 -0
  60. cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
  61. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  62. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  63. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  64. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  65. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  67. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
  68. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  69. cuda/cccl/headers/include/cuda/algorithm +1 -1
  70. cuda/cccl/headers/include/cuda/devices +10 -0
  71. cuda/cccl/headers/include/cuda/iterator +1 -0
  72. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  73. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  75. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  76. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  77. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  78. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  80. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  81. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  82. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  83. cuda/cccl/headers/include/cuda/std/version +1 -4
  84. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  85. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  86. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  87. cuda/cccl/parallel/experimental/__init__.py +21 -74
  88. cuda/compute/__init__.py +77 -0
  89. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
  90. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  91. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  92. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  93. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  94. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
  95. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  96. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  97. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  98. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  99. cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  100. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  101. cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  102. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  103. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  104. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  105. cuda/coop/__init__.py +8 -0
  106. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  107. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  108. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  109. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  110. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  111. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  112. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  113. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  114. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  115. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  116. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  117. cuda/coop/warp/__init__.py +9 -0
  118. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  119. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  120. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  121. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  122. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
  123. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  125. cuda/cccl/parallel/experimental/.gitignore +0 -4
  126. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  127. /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
  128. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  129. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  130. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  131. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  132. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  133. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  134. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  135. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  136. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  137. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  138. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  139. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  140. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  141. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  142. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  143. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  144. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -328,11 +328,6 @@ struct DispatchReduceDeterministic
328
328
  // Alias the allocation for the privatized per-block reductions
329
329
  deterministic_accum_t* d_block_reductions = (deterministic_accum_t*) allocations[0];
330
330
 
331
- if (num_chunks > 1 && !detail::all_iterators_support_add_assign_operator(::cuda::std::int32_t{}, d_in))
332
- {
333
- return cudaErrorInvalidValue;
334
- }
335
-
336
331
  auto d_chunk_block_reductions = d_block_reductions;
337
332
  for (int chunk_index = 0; chunk_index < num_chunks; chunk_index++)
338
333
  {
@@ -372,7 +367,7 @@ struct DispatchReduceDeterministic
372
367
 
373
368
  if (chunk_index + 1 < num_chunks)
374
369
  {
375
- detail::advance_iterators_inplace_if_supported(d_in, num_current_items);
370
+ d_in += num_current_items;
376
371
  d_chunk_block_reductions += current_grid_size;
377
372
  }
378
373
 
@@ -20,7 +20,6 @@
20
20
 
21
21
  #include <cub/detail/launcher/cuda_runtime.cuh>
22
22
  #include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
23
- #include <cub/device/dispatch/dispatch_advance_iterators.cuh>
24
23
  #include <cub/device/dispatch/kernels/reduce.cuh>
25
24
  #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
26
25
  #include <cub/grid/grid_even_share.cuh>
@@ -40,7 +40,6 @@
40
40
  #include <cub/detail/device_double_buffer.cuh>
41
41
  #include <cub/detail/temporary_storage.cuh>
42
42
  #include <cub/device/device_partition.cuh>
43
- #include <cub/device/dispatch/dispatch_advance_iterators.cuh>
44
43
  #include <cub/device/dispatch/kernels/segmented_sort.cuh>
45
44
  #include <cub/device/dispatch/tuning/tuning_segmented_sort.cuh>
46
45
  #include <cub/util_debug.cuh>
@@ -764,8 +763,8 @@ private:
764
763
  BeginOffsetIteratorT current_begin_offset = d_begin_offsets;
765
764
  EndOffsetIteratorT current_end_offset = d_end_offsets;
766
765
 
767
- detail::advance_iterators_inplace_if_supported(current_begin_offset, current_seg_offset);
768
- detail::advance_iterators_inplace_if_supported(current_end_offset, current_seg_offset);
766
+ current_begin_offset += current_seg_offset;
767
+ current_end_offset += current_seg_offset;
769
768
 
770
769
  auto medium_indices_iterator =
771
770
  ::cuda::std::make_reverse_iterator(large_and_medium_segments_indices.get() + current_num_segments);
@@ -47,9 +47,7 @@
47
47
 
48
48
  CUB_NAMESPACE_BEGIN
49
49
 
50
- namespace detail
51
- {
52
- namespace reduce
50
+ namespace detail::reduce
53
51
  {
54
52
 
55
53
  /**
@@ -580,7 +578,6 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
580
578
  }
581
579
  }
582
580
 
583
- } // namespace reduce
584
- } // namespace detail
581
+ } // namespace detail::reduce
585
582
 
586
583
  CUB_NAMESPACE_END
@@ -42,9 +42,7 @@
42
42
 
43
43
  CUB_NAMESPACE_BEGIN
44
44
 
45
- namespace detail
46
- {
47
- namespace scan
45
+ namespace detail::scan
48
46
  {
49
47
 
50
48
  /******************************************************************************
@@ -186,7 +184,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
186
184
  AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile);
187
185
  }
188
186
 
189
- } // namespace scan
190
- } // namespace detail
187
+ } // namespace detail::scan
191
188
 
192
189
  CUB_NAMESPACE_END
@@ -43,9 +43,7 @@
43
43
 
44
44
  CUB_NAMESPACE_BEGIN
45
45
 
46
- namespace detail
47
- {
48
- namespace reduce
46
+ namespace detail::reduce
49
47
  {
50
48
 
51
49
  /// Normalize input iterator to segment offset
@@ -318,7 +316,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
318
316
  }
319
317
  }
320
318
 
321
- } // namespace reduce
322
- } // namespace detail
319
+ } // namespace detail::reduce
323
320
 
324
321
  CUB_NAMESPACE_END
@@ -43,9 +43,7 @@
43
43
 
44
44
  CUB_NAMESPACE_BEGIN
45
45
 
46
- namespace detail
47
- {
48
- namespace adjacent_difference
46
+ namespace detail::adjacent_difference
49
47
  {
50
48
  template <typename InputIteratorT, bool MayAlias>
51
49
  struct policy_hub
@@ -64,7 +62,6 @@ struct policy_hub
64
62
 
65
63
  using MaxPolicy = Policy500;
66
64
  };
67
- } // namespace adjacent_difference
68
- } // namespace detail
65
+ } // namespace detail::adjacent_difference
69
66
 
70
67
  CUB_NAMESPACE_END
@@ -43,9 +43,7 @@
43
43
 
44
44
  CUB_NAMESPACE_BEGIN
45
45
 
46
- namespace detail
47
- {
48
- namespace batch_memcpy
46
+ namespace detail::batch_memcpy
49
47
  {
50
48
  /**
51
49
  * Parameterizable tuning policy type for AgentBatchMemcpy
@@ -115,7 +113,6 @@ struct policy_hub
115
113
 
116
114
  using MaxPolicy = Policy700;
117
115
  };
118
- } // namespace batch_memcpy
119
- } // namespace detail
116
+ } // namespace detail::batch_memcpy
120
117
 
121
118
  CUB_NAMESPACE_END
@@ -42,9 +42,7 @@
42
42
 
43
43
  CUB_NAMESPACE_BEGIN
44
44
 
45
- namespace detail
46
- {
47
- namespace for_each
45
+ namespace detail::for_each
48
46
  {
49
47
 
50
48
  struct policy_hub_t
@@ -57,7 +55,6 @@ struct policy_hub_t
57
55
  using MaxPolicy = policy_500_t;
58
56
  };
59
57
 
60
- } // namespace for_each
61
- } // namespace detail
58
+ } // namespace detail::for_each
62
59
 
63
60
  CUB_NAMESPACE_END
@@ -46,9 +46,7 @@
46
46
 
47
47
  CUB_NAMESPACE_BEGIN
48
48
 
49
- namespace detail
50
- {
51
- namespace histogram
49
+ namespace detail::histogram
52
50
  {
53
51
  enum class primitive_sample
54
52
  {
@@ -272,7 +270,6 @@ struct policy_hub
272
270
 
273
271
  using MaxPolicy = Policy1000;
274
272
  };
275
- } // namespace histogram
276
- } // namespace detail
273
+ } // namespace detail::histogram
277
274
 
278
275
  CUB_NAMESPACE_END
@@ -42,9 +42,7 @@
42
42
 
43
43
  CUB_NAMESPACE_BEGIN
44
44
 
45
- namespace detail
46
- {
47
- namespace merge
45
+ namespace detail::merge
48
46
  {
49
47
  template <typename KeyT, typename ValueT>
50
48
  struct policy_hub
@@ -73,7 +71,6 @@ struct policy_hub
73
71
 
74
72
  using max_policy = policy600;
75
73
  };
76
- } // namespace merge
77
- } // namespace detail
74
+ } // namespace detail::merge
78
75
 
79
76
  CUB_NAMESPACE_END
@@ -62,6 +62,14 @@ struct MergeSortPolicyWrapper<StaticPolicyT, ::cuda::std::void_t<decltype(Static
62
62
  {}
63
63
 
64
64
  CUB_DEFINE_SUB_POLICY_GETTER(MergeSort);
65
+
66
+ #if defined(CUB_ENABLE_POLICY_PTX_JSON)
67
+ _CCCL_DEVICE static constexpr auto EncodedPolicy()
68
+ {
69
+ using namespace ptx_json;
70
+ return object<key<"MergeSortPolicy">() = MergeSort().EncodedPolicy()>();
71
+ }
72
+ #endif
65
73
  };
66
74
 
67
75
  template <typename PolicyT>
@@ -46,9 +46,7 @@
46
46
 
47
47
  CUB_NAMESPACE_BEGIN
48
48
 
49
- namespace detail
50
- {
51
- namespace radix
49
+ namespace detail::radix
52
50
  {
53
51
  // sm90 default
54
52
  template <size_t KeySize, size_t ValueSize, size_t OffsetSize>
@@ -1062,7 +1060,6 @@ struct policy_hub
1062
1060
  using MaxPolicy = Policy1000;
1063
1061
  };
1064
1062
 
1065
- } // namespace radix
1066
- } // namespace detail
1063
+ } // namespace detail::radix
1067
1064
 
1068
1065
  CUB_NAMESPACE_END
@@ -50,9 +50,7 @@
50
50
 
51
51
  CUB_NAMESPACE_BEGIN
52
52
 
53
- namespace detail
54
- {
55
- namespace reduce_by_key
53
+ namespace detail::reduce_by_key
56
54
  {
57
55
  enum class primitive_key
58
56
  {
@@ -939,7 +937,6 @@ struct policy_hub
939
937
  };
940
938
  using MaxPolicy = Policy1000;
941
939
  };
942
- } // namespace reduce_by_key
943
- } // namespace detail
940
+ } // namespace detail::reduce_by_key
944
941
 
945
942
  CUB_NAMESPACE_END
@@ -52,9 +52,7 @@
52
52
 
53
53
  CUB_NAMESPACE_BEGIN
54
54
 
55
- namespace detail
56
- {
57
- namespace rle
55
+ namespace detail::rle
58
56
  {
59
57
  enum class primitive_key
60
58
  {
@@ -670,7 +668,6 @@ struct policy_hub
670
668
  using MaxPolicy = Policy1000;
671
669
  };
672
670
  } // namespace non_trivial_runs
673
- } // namespace rle
674
- } // namespace detail
671
+ } // namespace detail::rle
675
672
 
676
673
  CUB_NAMESPACE_END
@@ -53,9 +53,7 @@
53
53
 
54
54
  CUB_NAMESPACE_BEGIN
55
55
 
56
- namespace detail
57
- {
58
- namespace scan
56
+ namespace detail::scan
59
57
  {
60
58
  enum class keep_rejects
61
59
  {
@@ -615,7 +613,6 @@ struct policy_hub
615
613
 
616
614
  using MaxPolicy = Policy1000;
617
615
  };
618
- } // namespace scan
619
- } // namespace detail
616
+ } // namespace detail::scan
620
617
 
621
618
  CUB_NAMESPACE_END
@@ -49,9 +49,7 @@
49
49
 
50
50
  CUB_NAMESPACE_BEGIN
51
51
 
52
- namespace detail
53
- {
54
- namespace scan_by_key
52
+ namespace detail::scan_by_key
55
53
  {
56
54
  enum class primitive_accum
57
55
  {
@@ -1007,7 +1005,6 @@ struct policy_hub
1007
1005
 
1008
1006
  using MaxPolicy = Policy1000;
1009
1007
  };
1010
- } // namespace scan_by_key
1011
- } // namespace detail
1008
+ } // namespace detail::scan_by_key
1012
1009
 
1013
1010
  CUB_NAMESPACE_END
@@ -43,9 +43,7 @@
43
43
 
44
44
  CUB_NAMESPACE_BEGIN
45
45
 
46
- namespace detail
47
- {
48
- namespace segmented_sort
46
+ namespace detail::segmented_sort
49
47
  {
50
48
 
51
49
  template <typename PolicyT, typename = void>
@@ -395,7 +393,6 @@ struct policy_hub
395
393
 
396
394
  using MaxPolicy = Policy860;
397
395
  };
398
- } // namespace segmented_sort
399
- } // namespace detail
396
+ } // namespace detail::segmented_sort
400
397
 
401
398
  CUB_NAMESPACE_END
@@ -47,9 +47,7 @@
47
47
 
48
48
  CUB_NAMESPACE_BEGIN
49
49
 
50
- namespace detail
51
- {
52
- namespace three_way_partition
50
+ namespace detail::three_way_partition
53
51
  {
54
52
 
55
53
  template <typename PolicyT, typename = void>
@@ -437,7 +435,6 @@ struct policy_hub
437
435
 
438
436
  using MaxPolicy = Policy1000;
439
437
  };
440
- } // namespace three_way_partition
441
- } // namespace detail
438
+ } // namespace detail::three_way_partition
442
439
 
443
440
  CUB_NAMESPACE_END
@@ -788,6 +788,16 @@ struct UniqueByKeyPolicyWrapper<StaticPolicyT,
788
788
  {
789
789
  return cub::detail::MakePolicyWrapper(typename StaticPolicyT::UniqueByKeyPolicyT());
790
790
  }
791
+
792
+ #if defined(CUB_ENABLE_POLICY_PTX_JSON)
793
+ _CCCL_DEVICE static constexpr auto EncodedPolicy()
794
+ {
795
+ using namespace ptx_json;
796
+ return object<key<"UniqueByKeyPolicyT">() = UniqueByKey().EncodedPolicy(),
797
+ key<"DelayConstructor">() =
798
+ StaticPolicyT::UniqueByKeyPolicyT::detail::delay_constructor_t::EncodedConstructor()>();
799
+ }
800
+ #endif
791
801
  };
792
802
 
793
803
  template <typename PolicyT>
@@ -51,6 +51,7 @@
51
51
  #include <cuda/__functional/maximum.h>
52
52
  #include <cuda/__functional/minimum.h>
53
53
  #include <cuda/__ptx/instructions/get_sreg.h>
54
+ #include <cuda/std/__bit/countr.h>
54
55
  #include <cuda/std/__functional/operations.h>
55
56
  #include <cuda/std/__type_traits/enable_if.h>
56
57
  #include <cuda/std/__type_traits/integral_constant.h>
@@ -701,7 +702,7 @@ struct WarpReduceShfl
701
702
  _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
702
703
  {
703
704
  // Get the start flags for each thread in the warp.
704
- int warp_flags = __ballot_sync(member_mask, flag);
705
+ unsigned warp_flags = __ballot_sync(member_mask, flag);
705
706
 
706
707
  // Convert to tail-segmented
707
708
  if (HEAD_SEGMENTED)
@@ -722,7 +723,7 @@ struct WarpReduceShfl
722
723
  warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
723
724
 
724
725
  // Find the next set flag
725
- int last_lane = __clz(__brev(warp_flags));
726
+ int last_lane = ::cuda::std::countr_zero(warp_flags);
726
727
 
727
728
  T output = input;
728
729
  // Template-iterate reduction steps
@@ -49,6 +49,7 @@
49
49
  #include <cub/util_type.cuh>
50
50
 
51
51
  #include <cuda/__ptx/instructions/get_sreg.h>
52
+ #include <cuda/std/__bit/countr.h>
52
53
  #include <cuda/std/__type_traits/integral_constant.h>
53
54
 
54
55
  CUB_NAMESPACE_BEGIN
@@ -215,7 +216,7 @@ struct WarpReduceSmem
215
216
  SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, ::cuda::std::true_type /*has_ballot*/)
216
217
  {
217
218
  // Get the start flags for each thread in the warp.
218
- int warp_flags = __ballot_sync(member_mask, flag);
219
+ unsigned warp_flags = __ballot_sync(member_mask, flag);
219
220
 
220
221
  if (!HEAD_SEGMENTED)
221
222
  {
@@ -232,7 +233,7 @@ struct WarpReduceSmem
232
233
  }
233
234
 
234
235
  // Find next flag
235
- int next_flag = __clz(__brev(warp_flags));
236
+ int next_flag = ::cuda::std::countr_zero(warp_flags);
236
237
 
237
238
  // Clip the next segment at the warp boundary if necessary
238
239
  if (LOGICAL_WARP_THREADS != 32)
@@ -50,8 +50,8 @@
50
50
 
51
51
  #include <cuda/__ptx/instructions/get_sreg.h>
52
52
  #include <cuda/std/__algorithm/clamp.h>
53
- #include <cuda/std/__algorithm/max.h>
54
53
  #include <cuda/std/__bit/has_single_bit.h>
54
+ #include <cuda/std/__bit/integral.h>
55
55
  #include <cuda/std/__functional/operations.h>
56
56
  #include <cuda/std/__type_traits/integral_constant.h>
57
57
  #include <cuda/std/__type_traits/is_integral.h>
@@ -630,7 +630,7 @@ struct WarpScanShfl
630
630
  ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();
631
631
 
632
632
  // Find index of first set bit
633
- int segment_first_lane = ::cuda::std::max(0, 31 - __clz(ballot));
633
+ int segment_first_lane = ::cuda::std::__bit_log2(ballot);
634
634
 
635
635
  // Iterate scan steps
636
636
  _CCCL_PRAGMA_UNROLL_FULL()
@@ -11,7 +11,7 @@
11
11
  #ifndef __CUDA___ALGORITHM_COMMON
12
12
  #define __CUDA___ALGORITHM_COMMON
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -11,7 +11,7 @@
11
11
  #ifndef __CUDA___ALGORITHM_COPY_H
12
12
  #define __CUDA___ALGORITHM_COPY_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -11,7 +11,7 @@
11
11
  #ifndef __CUDA___ALGORITHM_FILL
12
12
  #define __CUDA___ALGORITHM_FILL
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header