explodethosebits-0.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etb/__init__.py +351 -0
- etb/__init__.pyi +976 -0
- etb/_etb.cpython-39-x86_64-linux-gnu.so +0 -0
- etb/_version.py +34 -0
- etb/py.typed +2 -0
- explodethosebits-0.3.0.dist-info/METADATA +405 -0
- explodethosebits-0.3.0.dist-info/RECORD +88 -0
- explodethosebits-0.3.0.dist-info/WHEEL +6 -0
- explodethosebits-0.3.0.dist-info/licenses/LICENSE +21 -0
- explodethosebits-0.3.0.dist-info/sboms/auditwheel.cdx.json +1 -0
- explodethosebits.libs/libcudart-c3a75b33.so.12.8.90 +0 -0
- include/etb/bit_coordinate.hpp +45 -0
- include/etb/bit_extraction.hpp +79 -0
- include/etb/bit_pruning.hpp +122 -0
- include/etb/config.hpp +284 -0
- include/etb/cuda/arch_optimizations.cuh +358 -0
- include/etb/cuda/blackwell_optimizations.cuh +300 -0
- include/etb/cuda/cuda_common.cuh +265 -0
- include/etb/cuda/etb_cuda.cuh +200 -0
- include/etb/cuda/gpu_memory.cuh +406 -0
- include/etb/cuda/heuristics_kernel.cuh +315 -0
- include/etb/cuda/path_generator_kernel.cuh +272 -0
- include/etb/cuda/prefix_pruner_kernel.cuh +370 -0
- include/etb/cuda/signature_kernel.cuh +328 -0
- include/etb/early_stopping.hpp +246 -0
- include/etb/etb.hpp +20 -0
- include/etb/heuristics.hpp +165 -0
- include/etb/memoization.hpp +285 -0
- include/etb/path.hpp +86 -0
- include/etb/path_count.hpp +87 -0
- include/etb/path_generator.hpp +175 -0
- include/etb/prefix_trie.hpp +339 -0
- include/etb/reporting.hpp +437 -0
- include/etb/scoring.hpp +269 -0
- include/etb/signature.hpp +190 -0
- include/gmock/gmock-actions.h +2297 -0
- include/gmock/gmock-cardinalities.h +159 -0
- include/gmock/gmock-function-mocker.h +518 -0
- include/gmock/gmock-matchers.h +5623 -0
- include/gmock/gmock-more-actions.h +658 -0
- include/gmock/gmock-more-matchers.h +120 -0
- include/gmock/gmock-nice-strict.h +277 -0
- include/gmock/gmock-spec-builders.h +2148 -0
- include/gmock/gmock.h +96 -0
- include/gmock/internal/custom/README.md +18 -0
- include/gmock/internal/custom/gmock-generated-actions.h +7 -0
- include/gmock/internal/custom/gmock-matchers.h +37 -0
- include/gmock/internal/custom/gmock-port.h +40 -0
- include/gmock/internal/gmock-internal-utils.h +487 -0
- include/gmock/internal/gmock-port.h +139 -0
- include/gmock/internal/gmock-pp.h +279 -0
- include/gtest/gtest-assertion-result.h +237 -0
- include/gtest/gtest-death-test.h +345 -0
- include/gtest/gtest-matchers.h +923 -0
- include/gtest/gtest-message.h +252 -0
- include/gtest/gtest-param-test.h +546 -0
- include/gtest/gtest-printers.h +1161 -0
- include/gtest/gtest-spi.h +250 -0
- include/gtest/gtest-test-part.h +192 -0
- include/gtest/gtest-typed-test.h +331 -0
- include/gtest/gtest.h +2321 -0
- include/gtest/gtest_pred_impl.h +279 -0
- include/gtest/gtest_prod.h +60 -0
- include/gtest/internal/custom/README.md +44 -0
- include/gtest/internal/custom/gtest-port.h +37 -0
- include/gtest/internal/custom/gtest-printers.h +42 -0
- include/gtest/internal/custom/gtest.h +37 -0
- include/gtest/internal/gtest-death-test-internal.h +307 -0
- include/gtest/internal/gtest-filepath.h +227 -0
- include/gtest/internal/gtest-internal.h +1560 -0
- include/gtest/internal/gtest-param-util.h +1026 -0
- include/gtest/internal/gtest-port-arch.h +122 -0
- include/gtest/internal/gtest-port.h +2481 -0
- include/gtest/internal/gtest-string.h +178 -0
- include/gtest/internal/gtest-type-util.h +220 -0
- lib/libetb_core.a +0 -0
- lib64/cmake/GTest/GTestConfig.cmake +33 -0
- lib64/cmake/GTest/GTestConfigVersion.cmake +43 -0
- lib64/cmake/GTest/GTestTargets-release.cmake +49 -0
- lib64/cmake/GTest/GTestTargets.cmake +139 -0
- lib64/libgmock.a +0 -0
- lib64/libgmock_main.a +0 -0
- lib64/libgtest.a +0 -0
- lib64/libgtest_main.a +0 -0
- lib64/pkgconfig/gmock.pc +10 -0
- lib64/pkgconfig/gmock_main.pc +10 -0
- lib64/pkgconfig/gtest.pc +9 -0
- lib64/pkgconfig/gtest_main.pc +10 -0
include/etb/cuda/arch_optimizations.cuh

@@ -0,0 +1,358 @@
#ifndef ETB_ARCH_OPTIMIZATIONS_CUH
#define ETB_ARCH_OPTIMIZATIONS_CUH

#include "cuda_common.cuh"

namespace etb {
namespace cuda {

// ============================================================================
// Architecture Detection and Configuration
// ============================================================================

/**
 * Runtime architecture detection.
 */
struct ArchitectureInfo {
    int sm_version;
    bool is_hopper;       // SM 90
    bool is_blackwell;    // SM 100
    bool has_tensor_cores;
    bool has_async_copy;
    bool has_cluster_launch;
    size_t max_shared_mem;
    int max_threads_per_sm;
    int registers_per_sm;

    ArchitectureInfo()
        : sm_version(0), is_hopper(false), is_blackwell(false)
        , has_tensor_cores(false), has_async_copy(false)
        , has_cluster_launch(false), max_shared_mem(0)
        , max_threads_per_sm(0), registers_per_sm(0) {}
};

/**
 * Get architecture information for a device.
 */
ArchitectureInfo get_architecture_info(int device_id = 0);

// ============================================================================
// Hopper (SM 90) Optimizations
// ============================================================================

namespace hopper {

/**
 * Hopper-optimized kernel configuration.
 */
struct HopperConfig {
    // Thread block configuration
    static constexpr int THREADS_PER_BLOCK = 256;
    static constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32;

    // Shared memory configuration
    static constexpr size_t DEFAULT_SHARED_MEM = 48 * 1024;   // 48KB
    static constexpr size_t MAX_SHARED_MEM = 228 * 1024;      // 228KB with opt-in

    // Occupancy targets
    static constexpr int TARGET_BLOCKS_PER_SM = 4;
    static constexpr int REGISTERS_PER_THREAD = 64;

    // Memory access patterns
    static constexpr int CACHE_LINE_SIZE = 128;
    static constexpr int SECTOR_SIZE = 32;
};

/**
 * Configure kernel for Hopper architecture.
 * Enables extended shared memory if beneficial.
 */
template<typename KernelFunc>
void configure_hopper_kernel(KernelFunc kernel, size_t shared_mem_required) {
    if (shared_mem_required > HopperConfig::DEFAULT_SHARED_MEM) {
        // Request extended shared memory
        cudaFuncSetAttribute(kernel,
                             cudaFuncAttributeMaxDynamicSharedMemorySize,
                             static_cast<int>(HopperConfig::MAX_SHARED_MEM));
    }

    // Set preferred cache configuration
    cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
}

/**
 * Hopper-optimized memory copy using async copy.
 * Uses cp.async for efficient global to shared memory transfers.
 * Note: For variable-size copies, use the templated version or memcpy fallback.
 */
template<size_t BYTES>
__device__ inline void async_copy_global_to_shared_fixed(
    void* shared_dst,
    const void* global_src
) {
#if __CUDA_ARCH__ >= 900
    // Use cp.async for Hopper with compile-time constant size
    static_assert(BYTES == 4 || BYTES == 8 || BYTES == 16,
                  "cp.async only supports 4, 8, or 16 byte copies");
    asm volatile(
        "cp.async.ca.shared.global [%0], [%1], %2;\n"
        :
        : "r"(static_cast<unsigned int>(__cvta_generic_to_shared(shared_dst))),
          "l"(global_src),
          "n"(BYTES)
    );
#else
    // Fallback for older architectures
    memcpy(shared_dst, global_src, BYTES);
#endif
}

/**
 * Variable-size async copy (uses memcpy fallback for non-constant sizes).
 */
__device__ inline void async_copy_global_to_shared(
    void* shared_dst,
    const void* global_src,
    size_t bytes
) {
    // For variable sizes, use standard memcpy
    // cp.async requires compile-time constant sizes
    memcpy(shared_dst, global_src, bytes);
}

/**
 * Commit async copies and wait.
 */
__device__ inline void async_copy_commit_and_wait() {
#if __CUDA_ARCH__ >= 900
    asm volatile("cp.async.commit_group;\n");
    asm volatile("cp.async.wait_group 0;\n");
#endif
    __syncthreads();
}

/**
 * Hopper-optimized warp-level reduction.
 * Uses warp shuffle with reduced synchronization.
 */
template<typename T>
__device__ inline T hopper_warp_reduce_sum(T val) {
    // Hopper supports efficient warp shuffles
    #pragma unroll
    for (int offset = 16; offset > 0; offset /= 2) {
        val += __shfl_down_sync(0xFFFFFFFF, val, offset);
    }
    return val;
}

/**
 * Hopper-optimized block-level reduction.
 */
template<typename T>
__device__ inline T hopper_block_reduce_sum(T val, T* shared_data) {
    const int tid = threadIdx.x;
    const int lane_id = tid % 32;
    const int warp_id = tid / 32;

    // Warp-level reduction
    val = hopper_warp_reduce_sum(val);

    // Store warp results
    if (lane_id == 0) {
        shared_data[warp_id] = val;
    }
    __syncthreads();

    // Final reduction in first warp
    if (warp_id == 0) {
        val = (tid < HopperConfig::WARPS_PER_BLOCK) ? shared_data[tid] : T(0);
        val = hopper_warp_reduce_sum(val);
    }

    return val;
}

} // namespace hopper

// ============================================================================
// Blackwell (SM 100) Optimizations
// ============================================================================

namespace blackwell {

/**
 * Blackwell-optimized kernel configuration.
 */
struct BlackwellConfig {
    // Thread block configuration - Blackwell supports larger blocks
    static constexpr int THREADS_PER_BLOCK = 512;
    static constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32;

    // Shared memory configuration - Blackwell has more shared memory
    static constexpr size_t DEFAULT_SHARED_MEM = 64 * 1024;   // 64KB
    static constexpr size_t MAX_SHARED_MEM = 256 * 1024;      // 256KB with opt-in

    // Occupancy targets
    static constexpr int TARGET_BLOCKS_PER_SM = 2;
    static constexpr int REGISTERS_PER_THREAD = 128;

    // Memory access patterns
    static constexpr int CACHE_LINE_SIZE = 128;
    static constexpr int SECTOR_SIZE = 32;

    // Blackwell-specific features
    static constexpr bool HAS_ENHANCED_TENSOR_CORES = true;
    static constexpr bool HAS_IMPROVED_L2_CACHE = true;
};

/**
 * Configure kernel for Blackwell architecture.
 */
template<typename KernelFunc>
void configure_blackwell_kernel(KernelFunc kernel, size_t shared_mem_required) {
    if (shared_mem_required > BlackwellConfig::DEFAULT_SHARED_MEM) {
        cudaFuncSetAttribute(kernel,
                             cudaFuncAttributeMaxDynamicSharedMemorySize,
                             static_cast<int>(BlackwellConfig::MAX_SHARED_MEM));
    }

    // Blackwell benefits from L2 cache preference for read-heavy workloads
    cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferL1);
}

/**
 * Blackwell-optimized memory prefetch.
 * Uses improved prefetch instructions.
 */
__device__ inline void prefetch_global(const void* ptr) {
#if __CUDA_ARCH__ >= 1000
    // Blackwell prefetch
    asm volatile("prefetch.global.L2 [%0];\n" : : "l"(ptr));
#elif __CUDA_ARCH__ >= 900
    // Hopper prefetch
    asm volatile("prefetch.global.L2 [%0];\n" : : "l"(ptr));
#endif
}

/**
 * Blackwell-optimized warp reduction with larger register file.
 */
template<typename T>
__device__ inline T blackwell_warp_reduce_sum(T val) {
    #pragma unroll
    for (int offset = 16; offset > 0; offset /= 2) {
        val += __shfl_down_sync(0xFFFFFFFF, val, offset);
    }
    return val;
}

/**
 * Blackwell-optimized block reduction for larger blocks.
 */
template<typename T>
__device__ inline T blackwell_block_reduce_sum(T val, T* shared_data) {
    const int tid = threadIdx.x;
    const int lane_id = tid % 32;
    const int warp_id = tid / 32;

    // Warp-level reduction
    val = blackwell_warp_reduce_sum(val);

    // Store warp results
    if (lane_id == 0) {
        shared_data[warp_id] = val;
    }
    __syncthreads();

    // Final reduction - Blackwell has more warps per block
    if (warp_id == 0) {
        val = (tid < BlackwellConfig::WARPS_PER_BLOCK) ? shared_data[tid] : T(0);
        val = blackwell_warp_reduce_sum(val);
    }

    return val;
}

/**
 * Blackwell-optimized histogram using larger shared memory.
 */
__device__ inline void blackwell_histogram_add(uint32_t* histogram, uint8_t value) {
    // Blackwell can handle more concurrent atomics efficiently
    atomicAdd(&histogram[value], 1);
}

} // namespace blackwell

// ============================================================================
// Architecture-Adaptive Kernel Launch
// ============================================================================

/**
 * Adaptive kernel configuration based on detected architecture.
 */
struct AdaptiveKernelConfig {
    int threads_per_block;
    int blocks_per_grid;
    size_t shared_mem_size;
    bool use_async_copy;
    bool use_extended_shared_mem;

    AdaptiveKernelConfig()
        : threads_per_block(256), blocks_per_grid(1)
        , shared_mem_size(48 * 1024), use_async_copy(false)
        , use_extended_shared_mem(false) {}
};

/**
 * Get adaptive configuration for current device.
 */
AdaptiveKernelConfig get_adaptive_config(int device_id, size_t work_items,
                                         size_t shared_mem_required);

/**
 * Architecture-specific kernel launcher.
 */
class AdaptiveKernelLauncher {
public:
    AdaptiveKernelLauncher();
    ~AdaptiveKernelLauncher();

    /**
     * Initialize for a specific device.
     */
    void initialize(int device_id);

    /**
     * Get the detected architecture.
     */
    const ArchitectureInfo& get_arch_info() const { return arch_info_; }

    /**
     * Check if Hopper optimizations should be used.
     */
    bool use_hopper_optimizations() const { return arch_info_.is_hopper; }

    /**
     * Check if Blackwell optimizations should be used.
     */
    bool use_blackwell_optimizations() const { return arch_info_.is_blackwell; }

    /**
     * Get optimal thread count for current architecture.
     */
    int get_optimal_threads() const;

    /**
     * Get optimal shared memory size for current architecture.
     */
    size_t get_optimal_shared_mem() const;

private:
    ArchitectureInfo arch_info_;
    bool initialized_;
};

} // namespace cuda
} // namespace etb

#endif // ETB_ARCH_OPTIMIZATIONS_CUH
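For orientation, here is a minimal sketch of how the declarations in this header might be driven from a small CUDA program. The kernel sum_kernel, the launch shape, and the include path are illustrative assumptions, not part of the package; get_architecture_info() and the launcher methods are only declared above and are presumed to be implemented in the bundled lib/libetb_core.a.

// Hedged usage sketch for arch_optimizations.cuh (names marked below are assumptions).
#include <cstdio>
#include <cuda_runtime.h>
#include "etb/cuda/arch_optimizations.cuh"  // assumes -I<wheel>/include

// Hypothetical reduction kernel built on hopper_block_reduce_sum; expects a 256-thread block.
__global__ void sum_kernel(const float* in, int n, float* out) {
    using namespace etb::cuda::hopper;
    __shared__ float warp_sums[HopperConfig::WARPS_PER_BLOCK];  // one slot per warp

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float v = (idx < n) ? in[idx] : 0.0f;

    float block_sum = hopper_block_reduce_sum(v, warp_sums);
    if (threadIdx.x == 0) {
        atomicAdd(out, block_sum);  // one atomic per block
    }
}

int main() {
    using namespace etb::cuda;

    // Detect the device so the caller can pick Hopper- or Blackwell-specific paths.
    ArchitectureInfo info = get_architecture_info(0);
    std::printf("SM %d, hopper=%d, blackwell=%d\n",
                info.sm_version, info.is_hopper, info.is_blackwell);

    const int n = 1 << 20;
    float *d_in = nullptr, *d_out = nullptr;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemset(d_in, 0, n * sizeof(float));
    cudaMemset(d_out, 0, sizeof(float));

    // Apply the Hopper cache/shared-memory preferences when applicable.
    if (info.is_hopper) {
        hopper::configure_hopper_kernel(sum_kernel, 0);
    }

    int threads = hopper::HopperConfig::THREADS_PER_BLOCK;  // 256
    int blocks = (n + threads - 1) / threads;
    sum_kernel<<<blocks, threads>>>(d_in, n, d_out);
    cudaDeviceSynchronize();

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}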
include/etb/cuda/blackwell_optimizations.cuh

@@ -0,0 +1,300 @@
#ifndef ETB_BLACKWELL_OPTIMIZATIONS_CUH
#define ETB_BLACKWELL_OPTIMIZATIONS_CUH

#include "cuda_common.cuh"
#include "arch_optimizations.cuh"

// Note: cooperative_groups.h is not included here to avoid namespace pollution
// with CCCL on MSVC. The cluster functions below are stubbed out.

namespace etb {
namespace cuda {
namespace blackwell {

// ============================================================================
// Blackwell (SM 100) Specific Optimizations
// ============================================================================

/**
 * Blackwell memory hierarchy configuration.
 * Blackwell has improved L2 cache and memory bandwidth.
 */
struct BlackwellMemoryConfig {
    // L2 cache configuration
    static constexpr size_t L2_CACHE_SIZE = 96 * 1024 * 1024;  // 96MB typical
    static constexpr size_t L2_CACHE_LINE = 128;

    // Memory bandwidth optimization
    static constexpr int COALESCING_WIDTH = 128;    // bytes
    static constexpr int OPTIMAL_VECTOR_WIDTH = 4;  // float4/int4

    // Prefetch distances
    static constexpr int PREFETCH_DISTANCE = 4;     // cache lines ahead
};

/**
 * Blackwell-optimized path generator configuration.
 */
struct BlackwellPathGeneratorConfig {
    // Larger thread blocks for better occupancy
    static constexpr int THREADS_PER_BLOCK = 512;
    static constexpr int PATHS_PER_THREAD = 4;

    // Work distribution
    static constexpr int WORK_ITEMS_PER_BLOCK = THREADS_PER_BLOCK * PATHS_PER_THREAD;

    // Shared memory layout
    static constexpr size_t SHARED_PREFIX_CACHE_SIZE = 32 * 1024;  // 32KB for prefix cache
    static constexpr size_t SHARED_WORK_QUEUE_SIZE = 16 * 1024;    // 16KB for work queue
};

/**
 * Blackwell-optimized heuristics configuration.
 */
struct BlackwellHeuristicsConfig {
    // Histogram configuration
    static constexpr int HISTOGRAM_BANKS = 32;  // Reduce bank conflicts
    static constexpr int HISTOGRAM_SIZE = 256 * HISTOGRAM_BANKS;

    // Parallel reduction
    static constexpr int REDUCTION_THREADS = 512;
    static constexpr int REDUCTION_WARPS = REDUCTION_THREADS / 32;
};

/**
 * Blackwell-optimized vectorized memory load.
 * Uses float4 for coalesced 128-bit loads.
 */
__device__ inline float4 blackwell_load_float4(const float* ptr) {
    return *reinterpret_cast<const float4*>(ptr);
}

__device__ inline uint4 blackwell_load_uint4(const uint32_t* ptr) {
    return *reinterpret_cast<const uint4*>(ptr);
}

/**
 * Blackwell-optimized vectorized memory store.
 */
__device__ inline void blackwell_store_float4(float* ptr, float4 val) {
    *reinterpret_cast<float4*>(ptr) = val;
}

__device__ inline void blackwell_store_uint4(uint32_t* ptr, uint4 val) {
    *reinterpret_cast<uint4*>(ptr) = val;
}

/**
 * Blackwell L2 cache hint for read-only data.
 */
__device__ inline void blackwell_cache_hint_readonly(const void* ptr) {
#if __CUDA_ARCH__ >= 1000
    asm volatile("prefetch.global.L2::evict_last [%0];\n" : : "l"(ptr));
#endif
}

/**
 * Blackwell L2 cache hint for streaming data.
 */
__device__ inline void blackwell_cache_hint_streaming(const void* ptr) {
#if __CUDA_ARCH__ >= 1000
    asm volatile("prefetch.global.L2::evict_first [%0];\n" : : "l"(ptr));
#endif
}

/**
 * Blackwell-optimized histogram with bank conflict avoidance.
 * Uses padding to avoid shared memory bank conflicts.
 */
template<int BANKS = 32>
__device__ inline void blackwell_histogram_add_banked(
    uint32_t* histogram,  // Size should be 256 * BANKS
    uint8_t value,
    int thread_id
) {
    // Each thread uses a different bank based on thread ID
    int bank = thread_id % BANKS;
    atomicAdd(&histogram[value * BANKS + bank], 1);
}

/**
 * Reduce banked histogram to final histogram.
 */
template<int BANKS = 32>
__device__ inline void blackwell_histogram_reduce(
    uint32_t* banked_histogram,  // Input: 256 * BANKS
    uint32_t* final_histogram,   // Output: 256
    int thread_id,
    int block_size
) {
    // Each thread reduces one or more bins
    for (int bin = thread_id; bin < 256; bin += block_size) {
        uint32_t sum = 0;
        for (int bank = 0; bank < BANKS; ++bank) {
            sum += banked_histogram[bin * BANKS + bank];
        }
        final_histogram[bin] = sum;
    }
}

/**
 * Blackwell-optimized parallel entropy calculation.
 * Uses larger thread blocks and vectorized operations.
 */
__device__ inline float blackwell_calculate_entropy(
    const uint32_t* histogram,
    uint32_t total,
    float* scratch,
    int tid,
    int block_size
) {
    // Each thread handles multiple bins
    int bins_per_thread = (256 + block_size - 1) / block_size;
    float local_entropy = 0.0f;

    #pragma unroll 4
    for (int i = 0; i < bins_per_thread; ++i) {
        int bin = tid * bins_per_thread + i;
        if (bin < 256) {
            uint32_t count = histogram[bin];
            if (count > 0 && total > 0) {
                float p = static_cast<float>(count) / static_cast<float>(total);
                local_entropy -= p * log2f(p);
            }
        }
    }

    // Block reduction
    scratch[tid] = local_entropy;
    __syncthreads();

    // Tree reduction with larger stride for 512 threads
    for (int stride = block_size / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            scratch[tid] += scratch[tid + stride];
        }
        __syncthreads();
    }

    return scratch[0];
}

/**
 * Blackwell-optimized signature matching with vectorized comparison.
 */
__device__ inline bool blackwell_signature_match_vectorized(
    const uint8_t* data,
    const uint8_t* signature,
    const uint8_t* mask,
    int length
) {
    // Process 4 bytes at a time when possible
    int vec_length = length / 4;
    int remainder = length % 4;

    // Vectorized comparison
    for (int i = 0; i < vec_length; ++i) {
        uint32_t data_vec = *reinterpret_cast<const uint32_t*>(data + i * 4);
        uint32_t sig_vec = *reinterpret_cast<const uint32_t*>(signature + i * 4);
        uint32_t mask_vec = *reinterpret_cast<const uint32_t*>(mask + i * 4);

        if ((data_vec & mask_vec) != (sig_vec & mask_vec)) {
            return false;
        }
    }

    // Handle remainder
    for (int i = vec_length * 4; i < length; ++i) {
        if ((data[i] & mask[i]) != (signature[i] & mask[i])) {
            return false;
        }
    }

    return true;
}

/**
 * Blackwell-optimized work stealing with improved atomics.
 */
__device__ inline bool blackwell_steal_work(
    uint32_t* work_queue,
    uint32_t* head,
    uint32_t* tail,
    uint32_t queue_capacity,
    uint32_t& work_item
) {
    // Use atomic exchange for more efficient stealing
    uint32_t old_head = atomicAdd(head, 1);
    uint32_t current_tail = __ldcg(tail);  // Cache-global load

    if (old_head < current_tail) {
        work_item = work_queue[old_head % queue_capacity];
        return true;
    }

    // Restore head if no work
    atomicSub(head, 1);
    return false;
}

/**
 * Blackwell-optimized cooperative group operations.
 * Uses thread block clusters when available.
 * Note: These are stubbed out due to CCCL/MSVC compatibility issues.
 * Full implementation would require cooperative_groups.h.
 */
#if __CUDA_ARCH__ >= 1000
__device__ inline void blackwell_cluster_sync() {
    // Stub - would use cooperative_groups cluster sync
    __syncthreads();
}

__device__ inline int blackwell_cluster_thread_rank() {
    // Stub - returns block-local thread rank
    return threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
}

__device__ inline int blackwell_cluster_size() {
    // Stub - returns block size
    return blockDim.x * blockDim.y * blockDim.z;
}
#endif

/**
 * Configure kernel for Blackwell with cluster launch.
 */
template<typename KernelFunc>
cudaError_t configure_blackwell_cluster_kernel(
    KernelFunc kernel,
    int cluster_size,
    size_t shared_mem_required
) {
    cudaError_t err;

    // Set max dynamic shared memory
    if (shared_mem_required > BlackwellConfig::DEFAULT_SHARED_MEM) {
        err = cudaFuncSetAttribute(kernel,
                                   cudaFuncAttributeMaxDynamicSharedMemorySize,
                                   static_cast<int>(BlackwellConfig::MAX_SHARED_MEM));
        if (err != cudaSuccess) return err;
    }

    // Set cluster dimensions (Blackwell feature)
#if CUDART_VERSION >= 12000
    cudaFuncAttribute attr = cudaFuncAttributeClusterDimMustBeSet;
    err = cudaFuncSetAttribute(kernel, attr, 1);
    if (err != cudaSuccess) return err;

    attr = cudaFuncAttributeClusterSchedulingPolicyPreference;
    err = cudaFuncSetAttribute(kernel, attr,
                               cudaClusterSchedulingPolicySpread);
#endif

    return cudaSuccess;
}

} // namespace blackwell
} // namespace cuda
} // namespace etb

#endif // ETB_BLACKWELL_OPTIMIZATIONS_CUH
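As a rough illustration of the banked-histogram path in this header, the following single-block sketch builds a byte histogram of a device buffer, collapses the per-bank copies, and reduces the result to a Shannon entropy value. The kernel name, the single-block launch, and the buffer sizes are assumptions for illustration only; the helper calls are the ones declared above.

// Hedged usage sketch for blackwell_optimizations.cuh (kernel name and launch shape are assumptions).
#include <cstdint>
#include <cuda_runtime.h>
#include "etb/cuda/blackwell_optimizations.cuh"  // assumes -I<wheel>/include

// One 512-thread block builds a banked byte histogram of `data`, collapses it,
// and writes the Shannon entropy of the byte distribution to *entropy_out.
__global__ void byte_entropy_kernel(const uint8_t* data, uint32_t n, float* entropy_out) {
    using namespace etb::cuda::blackwell;
    constexpr int BANKS = BlackwellHeuristicsConfig::HISTOGRAM_BANKS;        // 32

    __shared__ uint32_t banked[BlackwellHeuristicsConfig::HISTOGRAM_SIZE];   // 256 * 32 counters (32KB)
    __shared__ uint32_t hist[256];
    __shared__ float scratch[BlackwellConfig::THREADS_PER_BLOCK];            // 512 floats

    const int tid = threadIdx.x;
    const int block_size = blockDim.x;

    // Zero the banked histogram cooperatively.
    for (int i = tid; i < BlackwellHeuristicsConfig::HISTOGRAM_SIZE; i += block_size) {
        banked[i] = 0;
    }
    __syncthreads();

    // Strided accumulation; each thread sticks to its own bank to limit conflicts.
    for (uint32_t i = tid; i < n; i += block_size) {
        blackwell_histogram_add_banked<BANKS>(banked, data[i], tid);
    }
    __syncthreads();

    // Collapse the per-bank copies into a single 256-bin histogram.
    blackwell_histogram_reduce<BANKS>(banked, hist, tid, block_size);
    __syncthreads();

    // Block-wide entropy reduction over the 256 bins.
    float h = blackwell_calculate_entropy(hist, n, scratch, tid, block_size);
    if (tid == 0) {
        *entropy_out = h;
    }
}

// Launch with a single block sized to the Blackwell default (a power of two, as the
// tree reduction in blackwell_calculate_entropy expects):
//   byte_entropy_kernel<<<1, etb::cuda::blackwell::BlackwellConfig::THREADS_PER_BLOCK>>>(d_data, n, d_entropy);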