explodethosebits-0.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. etb/__init__.py +351 -0
  2. etb/__init__.pyi +976 -0
  3. etb/_etb.cpython-39-x86_64-linux-gnu.so +0 -0
  4. etb/_version.py +34 -0
  5. etb/py.typed +2 -0
  6. explodethosebits-0.3.0.dist-info/METADATA +405 -0
  7. explodethosebits-0.3.0.dist-info/RECORD +88 -0
  8. explodethosebits-0.3.0.dist-info/WHEEL +6 -0
  9. explodethosebits-0.3.0.dist-info/licenses/LICENSE +21 -0
  10. explodethosebits-0.3.0.dist-info/sboms/auditwheel.cdx.json +1 -0
  11. explodethosebits.libs/libcudart-c3a75b33.so.12.8.90 +0 -0
  12. include/etb/bit_coordinate.hpp +45 -0
  13. include/etb/bit_extraction.hpp +79 -0
  14. include/etb/bit_pruning.hpp +122 -0
  15. include/etb/config.hpp +284 -0
  16. include/etb/cuda/arch_optimizations.cuh +358 -0
  17. include/etb/cuda/blackwell_optimizations.cuh +300 -0
  18. include/etb/cuda/cuda_common.cuh +265 -0
  19. include/etb/cuda/etb_cuda.cuh +200 -0
  20. include/etb/cuda/gpu_memory.cuh +406 -0
  21. include/etb/cuda/heuristics_kernel.cuh +315 -0
  22. include/etb/cuda/path_generator_kernel.cuh +272 -0
  23. include/etb/cuda/prefix_pruner_kernel.cuh +370 -0
  24. include/etb/cuda/signature_kernel.cuh +328 -0
  25. include/etb/early_stopping.hpp +246 -0
  26. include/etb/etb.hpp +20 -0
  27. include/etb/heuristics.hpp +165 -0
  28. include/etb/memoization.hpp +285 -0
  29. include/etb/path.hpp +86 -0
  30. include/etb/path_count.hpp +87 -0
  31. include/etb/path_generator.hpp +175 -0
  32. include/etb/prefix_trie.hpp +339 -0
  33. include/etb/reporting.hpp +437 -0
  34. include/etb/scoring.hpp +269 -0
  35. include/etb/signature.hpp +190 -0
  36. include/gmock/gmock-actions.h +2297 -0
  37. include/gmock/gmock-cardinalities.h +159 -0
  38. include/gmock/gmock-function-mocker.h +518 -0
  39. include/gmock/gmock-matchers.h +5623 -0
  40. include/gmock/gmock-more-actions.h +658 -0
  41. include/gmock/gmock-more-matchers.h +120 -0
  42. include/gmock/gmock-nice-strict.h +277 -0
  43. include/gmock/gmock-spec-builders.h +2148 -0
  44. include/gmock/gmock.h +96 -0
  45. include/gmock/internal/custom/README.md +18 -0
  46. include/gmock/internal/custom/gmock-generated-actions.h +7 -0
  47. include/gmock/internal/custom/gmock-matchers.h +37 -0
  48. include/gmock/internal/custom/gmock-port.h +40 -0
  49. include/gmock/internal/gmock-internal-utils.h +487 -0
  50. include/gmock/internal/gmock-port.h +139 -0
  51. include/gmock/internal/gmock-pp.h +279 -0
  52. include/gtest/gtest-assertion-result.h +237 -0
  53. include/gtest/gtest-death-test.h +345 -0
  54. include/gtest/gtest-matchers.h +923 -0
  55. include/gtest/gtest-message.h +252 -0
  56. include/gtest/gtest-param-test.h +546 -0
  57. include/gtest/gtest-printers.h +1161 -0
  58. include/gtest/gtest-spi.h +250 -0
  59. include/gtest/gtest-test-part.h +192 -0
  60. include/gtest/gtest-typed-test.h +331 -0
  61. include/gtest/gtest.h +2321 -0
  62. include/gtest/gtest_pred_impl.h +279 -0
  63. include/gtest/gtest_prod.h +60 -0
  64. include/gtest/internal/custom/README.md +44 -0
  65. include/gtest/internal/custom/gtest-port.h +37 -0
  66. include/gtest/internal/custom/gtest-printers.h +42 -0
  67. include/gtest/internal/custom/gtest.h +37 -0
  68. include/gtest/internal/gtest-death-test-internal.h +307 -0
  69. include/gtest/internal/gtest-filepath.h +227 -0
  70. include/gtest/internal/gtest-internal.h +1560 -0
  71. include/gtest/internal/gtest-param-util.h +1026 -0
  72. include/gtest/internal/gtest-port-arch.h +122 -0
  73. include/gtest/internal/gtest-port.h +2481 -0
  74. include/gtest/internal/gtest-string.h +178 -0
  75. include/gtest/internal/gtest-type-util.h +220 -0
  76. lib/libetb_core.a +0 -0
  77. lib64/cmake/GTest/GTestConfig.cmake +33 -0
  78. lib64/cmake/GTest/GTestConfigVersion.cmake +43 -0
  79. lib64/cmake/GTest/GTestTargets-release.cmake +49 -0
  80. lib64/cmake/GTest/GTestTargets.cmake +139 -0
  81. lib64/libgmock.a +0 -0
  82. lib64/libgmock_main.a +0 -0
  83. lib64/libgtest.a +0 -0
  84. lib64/libgtest_main.a +0 -0
  85. lib64/pkgconfig/gmock.pc +10 -0
  86. lib64/pkgconfig/gmock_main.pc +10 -0
  87. lib64/pkgconfig/gtest.pc +9 -0
  88. lib64/pkgconfig/gtest_main.pc +10 -0
include/etb/cuda/arch_optimizations.cuh
@@ -0,0 +1,358 @@
+#ifndef ETB_ARCH_OPTIMIZATIONS_CUH
+#define ETB_ARCH_OPTIMIZATIONS_CUH
+
+#include "cuda_common.cuh"
+
+namespace etb {
+namespace cuda {
+
+// ============================================================================
+// Architecture Detection and Configuration
+// ============================================================================
+
+/**
+ * Runtime architecture detection.
+ */
+struct ArchitectureInfo {
+    int sm_version;
+    bool is_hopper;       // SM 90
+    bool is_blackwell;    // SM 100
+    bool has_tensor_cores;
+    bool has_async_copy;
+    bool has_cluster_launch;
+    size_t max_shared_mem;
+    int max_threads_per_sm;
+    int registers_per_sm;
+
+    ArchitectureInfo()
+        : sm_version(0), is_hopper(false), is_blackwell(false)
+        , has_tensor_cores(false), has_async_copy(false)
+        , has_cluster_launch(false), max_shared_mem(0)
+        , max_threads_per_sm(0), registers_per_sm(0) {}
+};
+
+/**
+ * Get architecture information for a device.
+ */
+ArchitectureInfo get_architecture_info(int device_id = 0);
+
+// ============================================================================
+// Hopper (SM 90) Optimizations
+// ============================================================================
+
+namespace hopper {
+
+/**
+ * Hopper-optimized kernel configuration.
+ */
+struct HopperConfig {
+    // Thread block configuration
+    static constexpr int THREADS_PER_BLOCK = 256;
+    static constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32;
+
+    // Shared memory configuration
+    static constexpr size_t DEFAULT_SHARED_MEM = 48 * 1024;   // 48KB
+    static constexpr size_t MAX_SHARED_MEM = 228 * 1024;      // 228KB with opt-in
+
+    // Occupancy targets
+    static constexpr int TARGET_BLOCKS_PER_SM = 4;
+    static constexpr int REGISTERS_PER_THREAD = 64;
+
+    // Memory access patterns
+    static constexpr int CACHE_LINE_SIZE = 128;
+    static constexpr int SECTOR_SIZE = 32;
+};
+
+/**
+ * Configure kernel for Hopper architecture.
+ * Enables extended shared memory if beneficial.
+ */
+template<typename KernelFunc>
+void configure_hopper_kernel(KernelFunc kernel, size_t shared_mem_required) {
+    if (shared_mem_required > HopperConfig::DEFAULT_SHARED_MEM) {
+        // Request extended shared memory
+        cudaFuncSetAttribute(kernel,
+                             cudaFuncAttributeMaxDynamicSharedMemorySize,
+                             static_cast<int>(HopperConfig::MAX_SHARED_MEM));
+    }
+
+    // Set preferred cache configuration
+    cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+}
+
+/**
+ * Hopper-optimized memory copy using async copy.
+ * Uses cp.async for efficient global to shared memory transfers.
+ * Note: For variable-size copies, use the templated version or memcpy fallback.
+ */
+template<size_t BYTES>
+__device__ inline void async_copy_global_to_shared_fixed(
+    void* shared_dst,
+    const void* global_src
+) {
+#if __CUDA_ARCH__ >= 900
+    // Use cp.async for Hopper with compile-time constant size
+    static_assert(BYTES == 4 || BYTES == 8 || BYTES == 16,
+                  "cp.async only supports 4, 8, or 16 byte copies");
+    asm volatile(
+        "cp.async.ca.shared.global [%0], [%1], %2;\n"
+        :
+        : "r"(static_cast<unsigned int>(__cvta_generic_to_shared(shared_dst))),
+          "l"(global_src),
+          "n"(BYTES)
+    );
+#else
+    // Fallback for older architectures
+    memcpy(shared_dst, global_src, BYTES);
+#endif
+}
+
+/**
+ * Variable-size async copy (uses memcpy fallback for non-constant sizes).
+ */
+__device__ inline void async_copy_global_to_shared(
+    void* shared_dst,
+    const void* global_src,
+    size_t bytes
+) {
+    // For variable sizes, use standard memcpy
+    // cp.async requires compile-time constant sizes
+    memcpy(shared_dst, global_src, bytes);
+}
+
+/**
+ * Commit async copies and wait.
+ */
+__device__ inline void async_copy_commit_and_wait() {
+#if __CUDA_ARCH__ >= 900
+    asm volatile("cp.async.commit_group;\n");
+    asm volatile("cp.async.wait_group 0;\n");
+#endif
+    __syncthreads();
+}
+
+/**
+ * Hopper-optimized warp-level reduction.
+ * Uses warp shuffle with reduced synchronization.
+ */
+template<typename T>
+__device__ inline T hopper_warp_reduce_sum(T val) {
+    // Hopper supports efficient warp shuffles
+    #pragma unroll
+    for (int offset = 16; offset > 0; offset /= 2) {
+        val += __shfl_down_sync(0xFFFFFFFF, val, offset);
+    }
+    return val;
+}
+
+/**
+ * Hopper-optimized block-level reduction.
+ */
+template<typename T>
+__device__ inline T hopper_block_reduce_sum(T val, T* shared_data) {
+    const int tid = threadIdx.x;
+    const int lane_id = tid % 32;
+    const int warp_id = tid / 32;
+
+    // Warp-level reduction
+    val = hopper_warp_reduce_sum(val);
+
+    // Store warp results
+    if (lane_id == 0) {
+        shared_data[warp_id] = val;
+    }
+    __syncthreads();
+
+    // Final reduction in first warp
+    if (warp_id == 0) {
+        val = (tid < HopperConfig::WARPS_PER_BLOCK) ? shared_data[tid] : T(0);
+        val = hopper_warp_reduce_sum(val);
+    }
+
+    return val;
+}
+
+} // namespace hopper
+
+// ============================================================================
+// Blackwell (SM 100) Optimizations
+// ============================================================================
+
+namespace blackwell {
+
+/**
+ * Blackwell-optimized kernel configuration.
+ */
+struct BlackwellConfig {
+    // Thread block configuration - Blackwell supports larger blocks
+    static constexpr int THREADS_PER_BLOCK = 512;
+    static constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32;
+
+    // Shared memory configuration - Blackwell has more shared memory
+    static constexpr size_t DEFAULT_SHARED_MEM = 64 * 1024;   // 64KB
+    static constexpr size_t MAX_SHARED_MEM = 256 * 1024;      // 256KB with opt-in
+
+    // Occupancy targets
+    static constexpr int TARGET_BLOCKS_PER_SM = 2;
+    static constexpr int REGISTERS_PER_THREAD = 128;
+
+    // Memory access patterns
+    static constexpr int CACHE_LINE_SIZE = 128;
+    static constexpr int SECTOR_SIZE = 32;
+
+    // Blackwell-specific features
+    static constexpr bool HAS_ENHANCED_TENSOR_CORES = true;
+    static constexpr bool HAS_IMPROVED_L2_CACHE = true;
+};
+
+/**
+ * Configure kernel for Blackwell architecture.
+ */
+template<typename KernelFunc>
+void configure_blackwell_kernel(KernelFunc kernel, size_t shared_mem_required) {
+    if (shared_mem_required > BlackwellConfig::DEFAULT_SHARED_MEM) {
+        cudaFuncSetAttribute(kernel,
+                             cudaFuncAttributeMaxDynamicSharedMemorySize,
+                             static_cast<int>(BlackwellConfig::MAX_SHARED_MEM));
+    }
+
+    // Blackwell benefits from L2 cache preference for read-heavy workloads
+    cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferL1);
+}
+
+/**
+ * Blackwell-optimized memory prefetch.
+ * Uses improved prefetch instructions.
+ */
+__device__ inline void prefetch_global(const void* ptr) {
+#if __CUDA_ARCH__ >= 1000
+    // Blackwell prefetch
+    asm volatile("prefetch.global.L2 [%0];\n" : : "l"(ptr));
+#elif __CUDA_ARCH__ >= 900
+    // Hopper prefetch
+    asm volatile("prefetch.global.L2 [%0];\n" : : "l"(ptr));
+#endif
+}
+
+/**
+ * Blackwell-optimized warp reduction with larger register file.
+ */
+template<typename T>
+__device__ inline T blackwell_warp_reduce_sum(T val) {
+    #pragma unroll
+    for (int offset = 16; offset > 0; offset /= 2) {
+        val += __shfl_down_sync(0xFFFFFFFF, val, offset);
+    }
+    return val;
+}
+
+/**
+ * Blackwell-optimized block reduction for larger blocks.
+ */
+template<typename T>
+__device__ inline T blackwell_block_reduce_sum(T val, T* shared_data) {
+    const int tid = threadIdx.x;
+    const int lane_id = tid % 32;
+    const int warp_id = tid / 32;
+
+    // Warp-level reduction
+    val = blackwell_warp_reduce_sum(val);
+
+    // Store warp results
+    if (lane_id == 0) {
+        shared_data[warp_id] = val;
+    }
+    __syncthreads();
+
+    // Final reduction - Blackwell has more warps per block
+    if (warp_id == 0) {
+        val = (tid < BlackwellConfig::WARPS_PER_BLOCK) ? shared_data[tid] : T(0);
+        val = blackwell_warp_reduce_sum(val);
+    }
+
+    return val;
+}
+
+/**
+ * Blackwell-optimized histogram using larger shared memory.
+ */
+__device__ inline void blackwell_histogram_add(uint32_t* histogram, uint8_t value) {
+    // Blackwell can handle more concurrent atomics efficiently
+    atomicAdd(&histogram[value], 1);
+}
+
+} // namespace blackwell
+
+// ============================================================================
+// Architecture-Adaptive Kernel Launch
+// ============================================================================
+
+/**
+ * Adaptive kernel configuration based on detected architecture.
+ */
+struct AdaptiveKernelConfig {
+    int threads_per_block;
+    int blocks_per_grid;
+    size_t shared_mem_size;
+    bool use_async_copy;
+    bool use_extended_shared_mem;
+
+    AdaptiveKernelConfig()
+        : threads_per_block(256), blocks_per_grid(1)
+        , shared_mem_size(48 * 1024), use_async_copy(false)
+        , use_extended_shared_mem(false) {}
+};
+
+/**
+ * Get adaptive configuration for current device.
+ */
+AdaptiveKernelConfig get_adaptive_config(int device_id, size_t work_items,
+                                         size_t shared_mem_required);
+
+/**
+ * Architecture-specific kernel launcher.
+ */
+class AdaptiveKernelLauncher {
+public:
+    AdaptiveKernelLauncher();
+    ~AdaptiveKernelLauncher();
+
+    /**
+     * Initialize for a specific device.
+     */
+    void initialize(int device_id);
+
+    /**
+     * Get the detected architecture.
+     */
+    const ArchitectureInfo& get_arch_info() const { return arch_info_; }
+
+    /**
+     * Check if Hopper optimizations should be used.
+     */
+    bool use_hopper_optimizations() const { return arch_info_.is_hopper; }
+
+    /**
+     * Check if Blackwell optimizations should be used.
+     */
+    bool use_blackwell_optimizations() const { return arch_info_.is_blackwell; }
+
+    /**
+     * Get optimal thread count for current architecture.
+     */
+    int get_optimal_threads() const;
+
+    /**
+     * Get optimal shared memory size for current architecture.
+     */
+    size_t get_optimal_shared_mem() const;
+
+private:
+    ArchitectureInfo arch_info_;
+    bool initialized_;
+};
+
+} // namespace cuda
+} // namespace etb
+
+#endif // ETB_ARCH_OPTIMIZATIONS_CUH
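The header above only declares the detection and adaptive-launch API; the definitions ship in the prebuilt core library bundled with the wheel (lib/libetb_core.a). For orientation, a minimal host-side sketch of how such an API would typically be driven, assuming the include/ directory is on the include path and those symbols resolve at link time; the example is illustrative and is not part of the package:

#include <cstdio>
#include <cuda_runtime.h>
#include "etb/cuda/arch_optimizations.cuh"

int main() {
    // Query architecture info for device 0 (the declared default).
    etb::cuda::ArchitectureInfo info = etb::cuda::get_architecture_info(0);
    std::printf("SM %d, hopper=%d, blackwell=%d, max shared mem=%zu bytes\n",
                info.sm_version, info.is_hopper, info.is_blackwell,
                info.max_shared_mem);

    // Pick per-architecture launch parameters via the adaptive launcher.
    etb::cuda::AdaptiveKernelLauncher launcher;
    launcher.initialize(0);
    int threads   = launcher.get_optimal_threads();     // per HopperConfig/BlackwellConfig
    size_t smem   = launcher.get_optimal_shared_mem();

    // A kernel would then be launched as <<<blocks, threads, smem>>>.
    std::printf("threads per block=%d, dynamic shared mem=%zu bytes\n", threads, smem);
    return 0;
}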
include/etb/cuda/blackwell_optimizations.cuh
@@ -0,0 +1,300 @@
+#ifndef ETB_BLACKWELL_OPTIMIZATIONS_CUH
+#define ETB_BLACKWELL_OPTIMIZATIONS_CUH
+
+#include "cuda_common.cuh"
+#include "arch_optimizations.cuh"
+
+// Note: cooperative_groups.h is not included here to avoid namespace pollution
+// with CCCL on MSVC. The cluster functions below are stubbed out.
+
+namespace etb {
+namespace cuda {
+namespace blackwell {
+
+// ============================================================================
+// Blackwell (SM 100) Specific Optimizations
+// ============================================================================
+
+/**
+ * Blackwell memory hierarchy configuration.
+ * Blackwell has improved L2 cache and memory bandwidth.
+ */
+struct BlackwellMemoryConfig {
+    // L2 cache configuration
+    static constexpr size_t L2_CACHE_SIZE = 96 * 1024 * 1024;  // 96MB typical
+    static constexpr size_t L2_CACHE_LINE = 128;
+
+    // Memory bandwidth optimization
+    static constexpr int COALESCING_WIDTH = 128;    // bytes
+    static constexpr int OPTIMAL_VECTOR_WIDTH = 4;  // float4/int4
+
+    // Prefetch distances
+    static constexpr int PREFETCH_DISTANCE = 4;     // cache lines ahead
+};
+
+/**
+ * Blackwell-optimized path generator configuration.
+ */
+struct BlackwellPathGeneratorConfig {
+    // Larger thread blocks for better occupancy
+    static constexpr int THREADS_PER_BLOCK = 512;
+    static constexpr int PATHS_PER_THREAD = 4;
+
+    // Work distribution
+    static constexpr int WORK_ITEMS_PER_BLOCK = THREADS_PER_BLOCK * PATHS_PER_THREAD;
+
+    // Shared memory layout
+    static constexpr size_t SHARED_PREFIX_CACHE_SIZE = 32 * 1024;  // 32KB for prefix cache
+    static constexpr size_t SHARED_WORK_QUEUE_SIZE = 16 * 1024;    // 16KB for work queue
+};
+
+/**
+ * Blackwell-optimized heuristics configuration.
+ */
+struct BlackwellHeuristicsConfig {
+    // Histogram configuration
+    static constexpr int HISTOGRAM_BANKS = 32;  // Reduce bank conflicts
+    static constexpr int HISTOGRAM_SIZE = 256 * HISTOGRAM_BANKS;
+
+    // Parallel reduction
+    static constexpr int REDUCTION_THREADS = 512;
+    static constexpr int REDUCTION_WARPS = REDUCTION_THREADS / 32;
+};
+
+/**
+ * Blackwell-optimized vectorized memory load.
+ * Uses float4 for coalesced 128-bit loads.
+ */
+__device__ inline float4 blackwell_load_float4(const float* ptr) {
+    return *reinterpret_cast<const float4*>(ptr);
+}
+
+__device__ inline uint4 blackwell_load_uint4(const uint32_t* ptr) {
+    return *reinterpret_cast<const uint4*>(ptr);
+}
+
+/**
+ * Blackwell-optimized vectorized memory store.
+ */
+__device__ inline void blackwell_store_float4(float* ptr, float4 val) {
+    *reinterpret_cast<float4*>(ptr) = val;
+}
+
+__device__ inline void blackwell_store_uint4(uint32_t* ptr, uint4 val) {
+    *reinterpret_cast<uint4*>(ptr) = val;
+}
+
+/**
+ * Blackwell L2 cache hint for read-only data.
+ */
+__device__ inline void blackwell_cache_hint_readonly(const void* ptr) {
+#if __CUDA_ARCH__ >= 1000
+    asm volatile("prefetch.global.L2::evict_last [%0];\n" : : "l"(ptr));
+#endif
+}
+
+/**
+ * Blackwell L2 cache hint for streaming data.
+ */
+__device__ inline void blackwell_cache_hint_streaming(const void* ptr) {
+#if __CUDA_ARCH__ >= 1000
+    asm volatile("prefetch.global.L2::evict_first [%0];\n" : : "l"(ptr));
+#endif
+}
+
+/**
+ * Blackwell-optimized histogram with bank conflict avoidance.
+ * Uses padding to avoid shared memory bank conflicts.
+ */
+template<int BANKS = 32>
+__device__ inline void blackwell_histogram_add_banked(
+    uint32_t* histogram,  // Size should be 256 * BANKS
+    uint8_t value,
+    int thread_id
+) {
+    // Each thread uses a different bank based on thread ID
+    int bank = thread_id % BANKS;
+    atomicAdd(&histogram[value * BANKS + bank], 1);
+}
+
+/**
+ * Reduce banked histogram to final histogram.
+ */
+template<int BANKS = 32>
+__device__ inline void blackwell_histogram_reduce(
+    uint32_t* banked_histogram,  // Input: 256 * BANKS
+    uint32_t* final_histogram,   // Output: 256
+    int thread_id,
+    int block_size
+) {
+    // Each thread reduces one or more bins
+    for (int bin = thread_id; bin < 256; bin += block_size) {
+        uint32_t sum = 0;
+        for (int bank = 0; bank < BANKS; ++bank) {
+            sum += banked_histogram[bin * BANKS + bank];
+        }
+        final_histogram[bin] = sum;
+    }
+}
+
+/**
+ * Blackwell-optimized parallel entropy calculation.
+ * Uses larger thread blocks and vectorized operations.
+ */
+__device__ inline float blackwell_calculate_entropy(
+    const uint32_t* histogram,
+    uint32_t total,
+    float* scratch,
+    int tid,
+    int block_size
+) {
+    // Each thread handles multiple bins
+    int bins_per_thread = (256 + block_size - 1) / block_size;
+    float local_entropy = 0.0f;
+
+    #pragma unroll 4
+    for (int i = 0; i < bins_per_thread; ++i) {
+        int bin = tid * bins_per_thread + i;
+        if (bin < 256) {
+            uint32_t count = histogram[bin];
+            if (count > 0 && total > 0) {
+                float p = static_cast<float>(count) / static_cast<float>(total);
+                local_entropy -= p * log2f(p);
+            }
+        }
+    }
+
+    // Block reduction
+    scratch[tid] = local_entropy;
+    __syncthreads();
+
+    // Tree reduction with larger stride for 512 threads
+    for (int stride = block_size / 2; stride > 0; stride /= 2) {
+        if (tid < stride) {
+            scratch[tid] += scratch[tid + stride];
+        }
+        __syncthreads();
+    }
+
+    return scratch[0];
+}
+
+/**
+ * Blackwell-optimized signature matching with vectorized comparison.
+ */
+__device__ inline bool blackwell_signature_match_vectorized(
+    const uint8_t* data,
+    const uint8_t* signature,
+    const uint8_t* mask,
+    int length
+) {
+    // Process 4 bytes at a time when possible
+    int vec_length = length / 4;
+    int remainder = length % 4;
+
+    // Vectorized comparison
+    for (int i = 0; i < vec_length; ++i) {
+        uint32_t data_vec = *reinterpret_cast<const uint32_t*>(data + i * 4);
+        uint32_t sig_vec = *reinterpret_cast<const uint32_t*>(signature + i * 4);
+        uint32_t mask_vec = *reinterpret_cast<const uint32_t*>(mask + i * 4);
+
+        if ((data_vec & mask_vec) != (sig_vec & mask_vec)) {
+            return false;
+        }
+    }
+
+    // Handle remainder
+    for (int i = vec_length * 4; i < length; ++i) {
+        if ((data[i] & mask[i]) != (signature[i] & mask[i])) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/**
+ * Blackwell-optimized work stealing with improved atomics.
+ */
+__device__ inline bool blackwell_steal_work(
+    uint32_t* work_queue,
+    uint32_t* head,
+    uint32_t* tail,
+    uint32_t queue_capacity,
+    uint32_t& work_item
+) {
+    // Use atomic exchange for more efficient stealing
+    uint32_t old_head = atomicAdd(head, 1);
+    uint32_t current_tail = __ldcg(tail);  // Cache-global load
+
+    if (old_head < current_tail) {
+        work_item = work_queue[old_head % queue_capacity];
+        return true;
+    }
+
+    // Restore head if no work
+    atomicSub(head, 1);
+    return false;
+}
+
+/**
+ * Blackwell-optimized cooperative group operations.
+ * Uses thread block clusters when available.
+ * Note: These are stubbed out due to CCCL/MSVC compatibility issues.
+ * Full implementation would require cooperative_groups.h.
+ */
+#if __CUDA_ARCH__ >= 1000
+__device__ inline void blackwell_cluster_sync() {
+    // Stub - would use cooperative_groups cluster sync
+    __syncthreads();
+}
+
+__device__ inline int blackwell_cluster_thread_rank() {
+    // Stub - returns block-local thread rank
+    return threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+}
+
+__device__ inline int blackwell_cluster_size() {
+    // Stub - returns block size
+    return blockDim.x * blockDim.y * blockDim.z;
+}
+#endif
+
+/**
+ * Configure kernel for Blackwell with cluster launch.
+ */
+template<typename KernelFunc>
+cudaError_t configure_blackwell_cluster_kernel(
+    KernelFunc kernel,
+    int cluster_size,
+    size_t shared_mem_required
+) {
+    cudaError_t err;
+
+    // Set max dynamic shared memory
+    if (shared_mem_required > BlackwellConfig::DEFAULT_SHARED_MEM) {
+        err = cudaFuncSetAttribute(kernel,
+                                   cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                   static_cast<int>(BlackwellConfig::MAX_SHARED_MEM));
+        if (err != cudaSuccess) return err;
+    }
+
+    // Set cluster dimensions (Blackwell feature)
+#if CUDART_VERSION >= 12000
+    cudaFuncAttribute attr = cudaFuncAttributeClusterDimMustBeSet;
+    err = cudaFuncSetAttribute(kernel, attr, 1);
+    if (err != cudaSuccess) return err;
+
+    attr = cudaFuncAttributeClusterSchedulingPolicyPreference;
+    err = cudaFuncSetAttribute(kernel, attr,
+                               cudaClusterSchedulingPolicySpread);
+#endif
+
+    return cudaSuccess;
+}
+
+} // namespace blackwell
+} // namespace cuda
+} // namespace etb
+
+#endif // ETB_BLACKWELL_OPTIMIZATIONS_CUH
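To illustrate the banked-histogram scheme declared above (scatter atomics across BANKS padded copies, then collapse them), here is a sketch of a hypothetical single-block kernel built on blackwell_histogram_add_banked and blackwell_histogram_reduce; the kernel is not part of the package and the launch configuration is an assumption:

#include <cstdint>
#include "etb/cuda/blackwell_optimizations.cuh"

// Hypothetical kernel: one block builds a 256-bin byte histogram over `n` bytes.
__global__ void byte_histogram_kernel(const uint8_t* data, int n, uint32_t* out_hist) {
    constexpr int BANKS = 32;
    __shared__ uint32_t banked[256 * BANKS];   // one padded copy per bank (32KB)
    __shared__ uint32_t final_hist[256];

    const int tid = threadIdx.x;

    // Zero the banked histogram cooperatively.
    for (int i = tid; i < 256 * BANKS; i += blockDim.x) {
        banked[i] = 0;
    }
    __syncthreads();

    // Scatter phase: each thread spreads its atomics across the BANKS copies.
    for (int i = tid; i < n; i += blockDim.x) {
        etb::cuda::blackwell::blackwell_histogram_add_banked<BANKS>(banked, data[i], tid);
    }
    __syncthreads();

    // Gather phase: sum the per-bank copies into the final 256-bin histogram.
    etb::cuda::blackwell::blackwell_histogram_reduce<BANKS>(
        banked, final_hist, tid, static_cast<int>(blockDim.x));
    __syncthreads();

    // Write the block's result to global memory.
    for (int bin = tid; bin < 256; bin += blockDim.x) {
        out_hist[bin] = final_hist[bin];
    }
}

A launch of byte_histogram_kernel<<<1, 512>>>(d_data, n, d_hist) would match the 512-thread block size suggested by BlackwellHeuristicsConfig; multi-block variants would need an additional global reduction.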