explodethosebits-0.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. etb/__init__.py +351 -0
  2. etb/__init__.pyi +976 -0
  3. etb/_etb.cpython-39-x86_64-linux-gnu.so +0 -0
  4. etb/_version.py +34 -0
  5. etb/py.typed +2 -0
  6. explodethosebits-0.3.0.dist-info/METADATA +405 -0
  7. explodethosebits-0.3.0.dist-info/RECORD +88 -0
  8. explodethosebits-0.3.0.dist-info/WHEEL +6 -0
  9. explodethosebits-0.3.0.dist-info/licenses/LICENSE +21 -0
  10. explodethosebits-0.3.0.dist-info/sboms/auditwheel.cdx.json +1 -0
  11. explodethosebits.libs/libcudart-c3a75b33.so.12.8.90 +0 -0
  12. include/etb/bit_coordinate.hpp +45 -0
  13. include/etb/bit_extraction.hpp +79 -0
  14. include/etb/bit_pruning.hpp +122 -0
  15. include/etb/config.hpp +284 -0
  16. include/etb/cuda/arch_optimizations.cuh +358 -0
  17. include/etb/cuda/blackwell_optimizations.cuh +300 -0
  18. include/etb/cuda/cuda_common.cuh +265 -0
  19. include/etb/cuda/etb_cuda.cuh +200 -0
  20. include/etb/cuda/gpu_memory.cuh +406 -0
  21. include/etb/cuda/heuristics_kernel.cuh +315 -0
  22. include/etb/cuda/path_generator_kernel.cuh +272 -0
  23. include/etb/cuda/prefix_pruner_kernel.cuh +370 -0
  24. include/etb/cuda/signature_kernel.cuh +328 -0
  25. include/etb/early_stopping.hpp +246 -0
  26. include/etb/etb.hpp +20 -0
  27. include/etb/heuristics.hpp +165 -0
  28. include/etb/memoization.hpp +285 -0
  29. include/etb/path.hpp +86 -0
  30. include/etb/path_count.hpp +87 -0
  31. include/etb/path_generator.hpp +175 -0
  32. include/etb/prefix_trie.hpp +339 -0
  33. include/etb/reporting.hpp +437 -0
  34. include/etb/scoring.hpp +269 -0
  35. include/etb/signature.hpp +190 -0
  36. include/gmock/gmock-actions.h +2297 -0
  37. include/gmock/gmock-cardinalities.h +159 -0
  38. include/gmock/gmock-function-mocker.h +518 -0
  39. include/gmock/gmock-matchers.h +5623 -0
  40. include/gmock/gmock-more-actions.h +658 -0
  41. include/gmock/gmock-more-matchers.h +120 -0
  42. include/gmock/gmock-nice-strict.h +277 -0
  43. include/gmock/gmock-spec-builders.h +2148 -0
  44. include/gmock/gmock.h +96 -0
  45. include/gmock/internal/custom/README.md +18 -0
  46. include/gmock/internal/custom/gmock-generated-actions.h +7 -0
  47. include/gmock/internal/custom/gmock-matchers.h +37 -0
  48. include/gmock/internal/custom/gmock-port.h +40 -0
  49. include/gmock/internal/gmock-internal-utils.h +487 -0
  50. include/gmock/internal/gmock-port.h +139 -0
  51. include/gmock/internal/gmock-pp.h +279 -0
  52. include/gtest/gtest-assertion-result.h +237 -0
  53. include/gtest/gtest-death-test.h +345 -0
  54. include/gtest/gtest-matchers.h +923 -0
  55. include/gtest/gtest-message.h +252 -0
  56. include/gtest/gtest-param-test.h +546 -0
  57. include/gtest/gtest-printers.h +1161 -0
  58. include/gtest/gtest-spi.h +250 -0
  59. include/gtest/gtest-test-part.h +192 -0
  60. include/gtest/gtest-typed-test.h +331 -0
  61. include/gtest/gtest.h +2321 -0
  62. include/gtest/gtest_pred_impl.h +279 -0
  63. include/gtest/gtest_prod.h +60 -0
  64. include/gtest/internal/custom/README.md +44 -0
  65. include/gtest/internal/custom/gtest-port.h +37 -0
  66. include/gtest/internal/custom/gtest-printers.h +42 -0
  67. include/gtest/internal/custom/gtest.h +37 -0
  68. include/gtest/internal/gtest-death-test-internal.h +307 -0
  69. include/gtest/internal/gtest-filepath.h +227 -0
  70. include/gtest/internal/gtest-internal.h +1560 -0
  71. include/gtest/internal/gtest-param-util.h +1026 -0
  72. include/gtest/internal/gtest-port-arch.h +122 -0
  73. include/gtest/internal/gtest-port.h +2481 -0
  74. include/gtest/internal/gtest-string.h +178 -0
  75. include/gtest/internal/gtest-type-util.h +220 -0
  76. lib/libetb_core.a +0 -0
  77. lib64/cmake/GTest/GTestConfig.cmake +33 -0
  78. lib64/cmake/GTest/GTestConfigVersion.cmake +43 -0
  79. lib64/cmake/GTest/GTestTargets-release.cmake +49 -0
  80. lib64/cmake/GTest/GTestTargets.cmake +139 -0
  81. lib64/libgmock.a +0 -0
  82. lib64/libgmock_main.a +0 -0
  83. lib64/libgtest.a +0 -0
  84. lib64/libgtest_main.a +0 -0
  85. lib64/pkgconfig/gmock.pc +10 -0
  86. lib64/pkgconfig/gmock_main.pc +10 -0
  87. lib64/pkgconfig/gtest.pc +9 -0
  88. lib64/pkgconfig/gtest_main.pc +10 -0
include/etb/cuda/gpu_memory.cuh
@@ -0,0 +1,406 @@
+ #ifndef ETB_GPU_MEMORY_CUH
+ #define ETB_GPU_MEMORY_CUH
+
+ #include "cuda_common.cuh"
+ #include <cstring>
+ #include <memory>
+ #include <vector>
+
+ namespace etb {
+ namespace cuda {
+
+ /**
+  * RAII wrapper for pinned (page-locked) host memory.
+  * Provides faster host-to-device transfers.
+  */
+ template<typename T>
+ class PinnedBuffer {
+ public:
+     PinnedBuffer() : data_(nullptr), size_(0), capacity_(0) {}
+
+     explicit PinnedBuffer(size_t count) : data_(nullptr), size_(0), capacity_(0) {
+         allocate(count);
+     }
+
+     ~PinnedBuffer() {
+         free();
+     }
+
+     // Non-copyable
+     PinnedBuffer(const PinnedBuffer&) = delete;
+     PinnedBuffer& operator=(const PinnedBuffer&) = delete;
+
+     // Movable
+     PinnedBuffer(PinnedBuffer&& other) noexcept
+         : data_(other.data_), size_(other.size_), capacity_(other.capacity_) {
+         other.data_ = nullptr;
+         other.size_ = 0;
+         other.capacity_ = 0;
+     }
+
+     PinnedBuffer& operator=(PinnedBuffer&& other) noexcept {
+         if (this != &other) {
+             free();
+             data_ = other.data_;
+             size_ = other.size_;
+             capacity_ = other.capacity_;
+             other.data_ = nullptr;
+             other.size_ = 0;
+             other.capacity_ = 0;
+         }
+         return *this;
+     }
+
+     void allocate(size_t count) {
+         if (count > capacity_) {
+             free();
+             ETB_CUDA_CHECK(cudaMallocHost(&data_, count * sizeof(T)));
+             capacity_ = count;
+         }
+         size_ = count;
+     }
+
+     void free() {
+         if (data_) {
+             cudaFreeHost(data_);
+             data_ = nullptr;
+             size_ = 0;
+             capacity_ = 0;
+         }
+     }
+
+     T* data() { return data_; }
+     const T* data() const { return data_; }
+     size_t size() const { return size_; }
+     size_t capacity() const { return capacity_; }
+     size_t bytes() const { return size_ * sizeof(T); }
+     bool empty() const { return size_ == 0; }
+
+     T& operator[](size_t index) { return data_[index]; }
+     const T& operator[](size_t index) const { return data_[index]; }
+
+     // Copy from host vector
+     void copy_from(const std::vector<T>& src) {
+         allocate(src.size());
+         std::memcpy(data_, src.data(), src.size() * sizeof(T));
+     }
+
+     // Copy to host vector
+     std::vector<T> to_vector() const {
+         return std::vector<T>(data_, data_ + size_);
+     }
+
+ private:
+     T* data_;
+     size_t size_;
+     size_t capacity_;
+ };
+
+ /**
+  * RAII wrapper for device (GPU) memory.
+  */
+ template<typename T>
+ class DeviceBuffer {
+ public:
+     DeviceBuffer() : data_(nullptr), size_(0), capacity_(0) {}
+
+     explicit DeviceBuffer(size_t count) : data_(nullptr), size_(0), capacity_(0) {
+         allocate(count);
+     }
+
+     ~DeviceBuffer() {
+         free();
+     }
+
+     // Non-copyable
+     DeviceBuffer(const DeviceBuffer&) = delete;
+     DeviceBuffer& operator=(const DeviceBuffer&) = delete;
+
+     // Movable
+     DeviceBuffer(DeviceBuffer&& other) noexcept
+         : data_(other.data_), size_(other.size_), capacity_(other.capacity_) {
+         other.data_ = nullptr;
+         other.size_ = 0;
+         other.capacity_ = 0;
+     }
+
+     DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {
+         if (this != &other) {
+             free();
+             data_ = other.data_;
+             size_ = other.size_;
+             capacity_ = other.capacity_;
+             other.data_ = nullptr;
+             other.size_ = 0;
+             other.capacity_ = 0;
+         }
+         return *this;
+     }
+
+     void allocate(size_t count) {
+         if (count > capacity_) {
+             free();
+             ETB_CUDA_CHECK(cudaMalloc(&data_, count * sizeof(T)));
+             capacity_ = count;
+         }
+         size_ = count;
+     }
+
+     void free() {
+         if (data_) {
+             cudaFree(data_);
+             data_ = nullptr;
+             size_ = 0;
+             capacity_ = 0;
+         }
+     }
+
+     void clear() {
+         if (data_ && size_ > 0) {
+             ETB_CUDA_CHECK(cudaMemset(data_, 0, size_ * sizeof(T)));
+         }
+     }
+
+     T* data() { return data_; }
+     const T* data() const { return data_; }
+     size_t size() const { return size_; }
+     size_t capacity() const { return capacity_; }
+     size_t bytes() const { return size_ * sizeof(T); }
+     bool empty() const { return size_ == 0; }
+
+     // Copy from host
+     void copy_from_host(const T* src, size_t count) {
+         allocate(count);
+         ETB_CUDA_CHECK(cudaMemcpy(data_, src, count * sizeof(T), cudaMemcpyHostToDevice));
+     }
+
+     void copy_from_host(const std::vector<T>& src) {
+         copy_from_host(src.data(), src.size());
+     }
+
+     void copy_from_host(const PinnedBuffer<T>& src) {
+         copy_from_host(src.data(), src.size());
+     }
+
+     // Async copy from pinned host memory
+     void copy_from_host_async(const PinnedBuffer<T>& src, cudaStream_t stream) {
+         allocate(src.size());
+         ETB_CUDA_CHECK(cudaMemcpyAsync(data_, src.data(), src.bytes(),
+                                        cudaMemcpyHostToDevice, stream));
+     }
+
+     // Copy to host
+     void copy_to_host(T* dst, size_t count) const {
+         ETB_CUDA_CHECK(cudaMemcpy(dst, data_, count * sizeof(T), cudaMemcpyDeviceToHost));
+     }
+
+     void copy_to_host(std::vector<T>& dst) const {
+         dst.resize(size_);
+         copy_to_host(dst.data(), size_);
+     }
+
+     void copy_to_host(PinnedBuffer<T>& dst) const {
+         dst.allocate(size_);
+         copy_to_host(dst.data(), size_);
+     }
+
+     // Async copy to pinned host memory
+     void copy_to_host_async(PinnedBuffer<T>& dst, cudaStream_t stream) const {
+         dst.allocate(size_);
+         ETB_CUDA_CHECK(cudaMemcpyAsync(dst.data(), data_, bytes(),
+                                        cudaMemcpyDeviceToHost, stream));
+     }
+
+ private:
+     T* data_;
+     size_t size_;
+     size_t capacity_;
+ };
+
+ /**
+  * GPU memory manager for the ETB library.
+  * Handles allocation and management of all GPU memory resources.
+  *
+  * Requirements: 9.4, 9.5, 9.6
+  */
+ class GPUMemoryManager {
+ public:
+     /**
+      * Configuration for GPU memory allocation.
+      */
+     struct Config {
+         size_t max_input_size;            // Maximum input buffer size
+         size_t prefix_trie_capacity;      // Number of trie nodes
+         size_t candidate_queue_capacity;  // Number of candidates to track
+         size_t work_queue_capacity;       // Work items for path generation
+         int num_streams;                  // Number of CUDA streams
+
+         Config()
+             : max_input_size(1024 * 1024)  // 1MB default
+             , prefix_trie_capacity(65536)
+             , candidate_queue_capacity(1024)
+             , work_queue_capacity(65536)
+             , num_streams(4) {}
+     };
+
+     GPUMemoryManager();
+     explicit GPUMemoryManager(const Config& config);
+     ~GPUMemoryManager();
+
+     // Non-copyable, non-movable
+     GPUMemoryManager(const GPUMemoryManager&) = delete;
+     GPUMemoryManager& operator=(const GPUMemoryManager&) = delete;
+
+     /**
+      * Initialize GPU memory with the given configuration.
+      * @param config Memory configuration
+      * @return true if initialization succeeded
+      */
+     bool initialize(const Config& config);
+
+     /**
+      * Check if the manager is initialized.
+      */
+     bool is_initialized() const { return initialized_; }
+
+     /**
+      * Release all GPU memory.
+      */
+     void release();
+
+     // Input buffer management (pinned host + device)
+     PinnedBuffer<uint8_t>& get_pinned_input() { return pinned_input_; }
+     DeviceBuffer<uint8_t>& get_device_input() { return device_input_; }
+
+     /**
+      * Upload input data to GPU.
+      * @param data Input byte data
+      * @param length Length of input
+      * @param stream CUDA stream for async transfer (nullptr for sync)
+      */
+     void upload_input(const uint8_t* data, size_t length, cudaStream_t stream = nullptr);
+
+     // Prefix trie (shared memory compatible)
+     DeviceBuffer<DevicePrefixTrieNode>& get_prefix_trie() { return prefix_trie_; }
+
+     /**
+      * Initialize prefix trie on device.
+      * @param initial_nodes Initial nodes to upload (optional)
+      */
+     void init_prefix_trie(const std::vector<DevicePrefixTrieNode>* initial_nodes = nullptr);
+
+     // Candidate queue (global memory)
+     DeviceBuffer<DeviceCandidate>& get_candidate_queue() { return candidate_queue_; }
+     DeviceBuffer<uint32_t>& get_candidate_count() { return candidate_count_; }
+     DeviceBuffer<float>& get_min_score() { return min_score_; }
+
+     /**
+      * Reset candidate queue for new extraction.
+      */
+     void reset_candidate_queue();
+
+     /**
+      * Download candidates from GPU.
+      * @param candidates Output vector for candidates
+      * @return Number of candidates downloaded
+      */
+     size_t download_candidates(std::vector<DeviceCandidate>& candidates);
+
+     // Work queue for path generation
+     DeviceBuffer<uint32_t>& get_work_queue() { return work_queue_; }
+     DeviceBuffer<uint32_t>& get_work_queue_head() { return work_queue_head_; }
+     DeviceBuffer<uint32_t>& get_work_queue_tail() { return work_queue_tail_; }
+
+     // Histogram buffer for entropy calculation
+     DeviceBuffer<uint32_t>& get_histogram() { return histogram_; }
+
+     // CUDA streams
+     cudaStream_t get_stream(int index) const;
+     int num_streams() const { return static_cast<int>(streams_.size()); }
+
+     /**
+      * Synchronize all streams.
+      */
+     void synchronize_all();
+
+     /**
+      * Get memory usage statistics.
+      */
+     struct MemoryStats {
+         size_t total_allocated;
+         size_t input_buffer_size;
+         size_t trie_size;
+         size_t candidate_queue_size;
+         size_t work_queue_size;
+         size_t other_size;
+     };
+     MemoryStats get_memory_stats() const;
+
+     /**
+      * Get the current configuration.
+      */
+     const Config& get_config() const { return config_; }
+
+ private:
+     Config config_;
+     bool initialized_;
+
+     // Pinned host memory for fast transfers
+     PinnedBuffer<uint8_t> pinned_input_;
+
+     // Device memory buffers
+     DeviceBuffer<uint8_t> device_input_;
+     DeviceBuffer<DevicePrefixTrieNode> prefix_trie_;
+     DeviceBuffer<DeviceCandidate> candidate_queue_;
+     DeviceBuffer<uint32_t> candidate_count_;
+     DeviceBuffer<float> min_score_;
+     DeviceBuffer<uint32_t> work_queue_;
+     DeviceBuffer<uint32_t> work_queue_head_;
+     DeviceBuffer<uint32_t> work_queue_tail_;
+     DeviceBuffer<uint32_t> histogram_;
+
+     // CUDA streams
+     std::vector<cudaStream_t> streams_;
+
+     void allocate_buffers();
+     void create_streams();
+     void destroy_streams();
+ };
+
+ /**
+  * Constant memory manager for signature dictionary.
+  * Signatures are stored in constant memory for fast broadcast reads.
+  *
+  * Requirements: 9.4
+  */
+ class SignatureConstantMemory {
+ public:
+     /**
+      * Upload signatures to constant memory.
+      * @param signatures Vector of device signatures
+      * @return true if upload succeeded
+      */
+     static bool upload_signatures(const std::vector<DeviceFileSignature>& signatures);
+
+     /**
+      * Upload footer signatures to constant memory.
+      * @param footers Vector of footer signatures
+      * @return true if upload succeeded
+      */
+     static bool upload_footers(const std::vector<DeviceFooterSignature>& footers);
+
+     /**
+      * Get the number of signatures in constant memory.
+      */
+     static uint32_t get_signature_count();
+
+     /**
+      * Clear constant memory signatures.
+      */
+     static void clear();
+ };
+
+ } // namespace cuda
+ } // namespace etb
+
+ #endif // ETB_GPU_MEMORY_CUH
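
The two buffer wrappers above are plain RAII types, so the intended flow (stage bytes in pinned memory, then issue asynchronous copies on a CUDA stream via copy_from_host_async/copy_to_host_async) can be pieced together from the declarations alone. The following is a minimal, illustrative host-side sketch of that flow, not code shipped in the wheel; it assumes the packaged include/ directory is on the compiler's include path, that ETB_CUDA_CHECK aborts on error, and the sample payload is invented for the example.

// Illustrative only: an asynchronous host->device->host round trip built from the
// wrappers declared in gpu_memory.cuh. The payload and file layout are hypothetical.
#include "etb/cuda/gpu_memory.cuh"
#include <cstdint>
#include <vector>

using etb::cuda::PinnedBuffer;
using etb::cuda::DeviceBuffer;

int main() {
    std::vector<uint8_t> bytes(1 << 20, 0xAB);     // 1 MiB of sample data

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    PinnedBuffer<uint8_t> staging;                 // page-locked host memory
    staging.copy_from(bytes);                      // host vector -> pinned buffer

    DeviceBuffer<uint8_t> device;
    device.copy_from_host_async(staging, stream);  // pinned -> device, async

    // ... launch kernels on `stream` that read device.data() ...

    PinnedBuffer<uint8_t> readback;
    device.copy_to_host_async(readback, stream);   // device -> pinned, async
    cudaStreamSynchronize(stream);                 // results are valid only after sync

    std::vector<uint8_t> result = readback.to_vector();
    cudaStreamDestroy(stream);
    return result.size() == bytes.size() ? 0 : 1;
}

Because allocate() only reallocates when the requested count exceeds the current capacity, reusing the same staging and device buffers across batches avoids repeated cudaMallocHost/cudaMalloc calls.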
include/etb/cuda/heuristics_kernel.cuh
@@ -0,0 +1,315 @@
+ #ifndef ETB_HEURISTICS_KERNEL_CUH
+ #define ETB_HEURISTICS_KERNEL_CUH
+
+ #include "cuda_common.cuh"
+ #include "gpu_memory.cuh"
+
+ namespace etb {
+ namespace cuda {
+
+ /**
+  * Shared memory structure for heuristics calculation.
+  * Uses a shared memory histogram for efficient entropy calculation.
+  */
+ struct HeuristicsSharedMem {
+     // Histogram for byte frequency (256 bins)
+     uint32_t histogram[256];
+
+     // Reduction scratch space
+     float reduction_scratch[256];
+
+     // Partial results
+     uint32_t printable_count;
+     uint32_t control_count;
+     uint32_t null_run_max;
+     uint32_t current_null_run;
+     uint32_t utf8_valid_count;
+     uint32_t utf8_total_count;
+ };
+
+ /**
+  * OPTIMIZED Heuristics CUDA kernel.
+  *
+  * Calculates Shannon entropy, byte distribution, and other heuristics
+  * using warp shuffle reductions and parallel UTF-8 validation.
+  *
+  * Optimizations applied:
+  * - Warp shuffle reductions instead of shared memory atomics for entropy
+  * - Parallel UTF-8 validation using a chunked heuristic approach
+  * - Better memory coalescing for histogram building
+  * - Reduced thread divergence in character classification
+  *
+  * Requirements: 9.3
+  *
+  * @param data Input byte data
+  * @param length Length of data
+  * @param weights Heuristic weights
+  * @param result Output heuristic result
+  */
+ __global__ void heuristics_kernel(
+     const uint8_t* data,
+     uint32_t length,
+     DeviceHeuristicWeights weights,
+     DeviceHeuristicResult* result
+ );
+
+ /**
+  * Batch heuristics kernel for evaluating multiple byte sequences.
+  *
+  * @param data_ptrs Array of pointers to byte sequences
+  * @param lengths Array of sequence lengths
+  * @param num_sequences Number of sequences to evaluate
+  * @param weights Heuristic weights
+  * @param results Output array of heuristic results
+  */
+ __global__ void batch_heuristics_kernel(
+     const uint8_t** data_ptrs,
+     const uint32_t* lengths,
+     uint32_t num_sequences,
+     DeviceHeuristicWeights weights,
+     DeviceHeuristicResult* results
+ );
+
+ /**
+  * Inline heuristics evaluation for use within other kernels.
+  * Uses warp-level operations for efficiency.
+  *
+  * @param data Byte data (in registers or shared memory)
+  * @param length Length of data (max 32 for inline evaluation)
+  * @param weights Heuristic weights
+  * @return Heuristic result
+  */
+ __device__ DeviceHeuristicResult evaluate_heuristics_inline(
+     const uint8_t* data,
+     uint32_t length,
+     const DeviceHeuristicWeights& weights
+ );
+
+ /**
+  * Host-side launcher for the heuristics kernel.
+  */
+ class HeuristicsKernel {
+ public:
+     HeuristicsKernel();
+     ~HeuristicsKernel();
+
+     /**
+      * Configure the kernel for a specific device.
+      * @param device_id CUDA device ID
+      */
+     void configure(int device_id);
+
+     /**
+      * Evaluate heuristics for a single byte sequence.
+      * @param data Device pointer to byte data
+      * @param length Length of data
+      * @param weights Heuristic weights
+      * @param result Device pointer to result
+      * @param stream CUDA stream
+      */
+     void evaluate(const uint8_t* data, uint32_t length,
+                   const DeviceHeuristicWeights& weights,
+                   DeviceHeuristicResult* result,
+                   cudaStream_t stream = nullptr);
+
+     /**
+      * Evaluate heuristics for multiple byte sequences.
+      * @param data_ptrs Device array of pointers to byte sequences
+      * @param lengths Device array of sequence lengths
+      * @param num_sequences Number of sequences
+      * @param weights Heuristic weights
+      * @param results Device array of results
+      * @param stream CUDA stream
+      */
+     void evaluate_batch(const uint8_t** data_ptrs, const uint32_t* lengths,
+                         uint32_t num_sequences,
+                         const DeviceHeuristicWeights& weights,
+                         DeviceHeuristicResult* results,
+                         cudaStream_t stream = nullptr);
+
+     /**
+      * Get the kernel configuration.
+      */
+     const KernelConfig& get_config() const { return kernel_config_; }
+
+ private:
+     KernelConfig kernel_config_;
+     bool configured_;
+ };
+
+ // ============================================================================
+ // Device Functions
+ // ============================================================================
+
+ /**
+  * Check if a byte is printable ASCII (0x20-0x7E).
+  */
+ __device__ inline bool is_printable_ascii(uint8_t byte) {
+     return byte >= 0x20 && byte <= 0x7E;
+ }
+
+ /**
+  * Check if a byte is a control character (0x00-0x1F, excluding 0x09, 0x0A, 0x0D).
+  */
+ __device__ inline bool is_control_char(uint8_t byte) {
+     if (byte > 0x1F) return false;
+     if (byte == 0x09 || byte == 0x0A || byte == 0x0D) return false; // Tab, LF, CR
+     return true;
+ }
+
+ /**
+  * Calculate entropy contribution for a single bin.
+  * Returns -p * log2(p) where p = count / total.
+  */
+ __device__ inline float entropy_contribution(uint32_t count, uint32_t total) {
+     if (count == 0 || total == 0) return 0.0f;
+     float p = static_cast<float>(count) / static_cast<float>(total);
+     return -p * log2f(p);
+ }
+
+ /**
+  * Warp-level histogram update using atomics.
+  */
+ __device__ inline void warp_histogram_add(uint32_t* histogram, uint8_t value) {
+     atomicAdd(&histogram[value], 1);
+ }
+
+ /**
+  * Warp-level reduction for summing histogram entropy.
+  */
+ __device__ inline float warp_reduce_entropy(float* scratch, uint32_t* histogram,
+                                             uint32_t total, int lane_id) {
+     // Each lane handles 8 histogram bins (256 / 32 = 8)
+     float local_entropy = 0.0f;
+     for (int i = 0; i < 8; ++i) {
+         int bin = lane_id * 8 + i;
+         local_entropy += entropy_contribution(histogram[bin], total);
+     }
+
+     // Warp reduction
+     for (int offset = 16; offset > 0; offset /= 2) {
+         local_entropy += __shfl_down_sync(0xFFFFFFFF, local_entropy, offset);
+     }
+
+     return __shfl_sync(0xFFFFFFFF, local_entropy, 0);
+ }
+
+ /**
+  * Block-level histogram reduction.
+  */
+ __device__ inline float block_reduce_entropy(float* scratch, uint32_t* histogram,
+                                              uint32_t total, int tid, int block_size) {
+     // First, each thread handles some bins
+     int bins_per_thread = (256 + block_size - 1) / block_size;
+     float local_entropy = 0.0f;
+
+     for (int i = 0; i < bins_per_thread; ++i) {
+         int bin = tid * bins_per_thread + i;
+         if (bin < 256) {
+             local_entropy += entropy_contribution(histogram[bin], total);
+         }
+     }
+
+     // Store to shared memory
+     scratch[tid] = local_entropy;
+     __syncthreads();
+
+     // Tree reduction
+     for (int stride = block_size / 2; stride > 0; stride /= 2) {
+         if (tid < stride) {
+             scratch[tid] += scratch[tid + stride];
+         }
+         __syncthreads();
+     }
+
+     return scratch[0];
+ }
+
+ /**
+  * Feed one byte into the UTF-8 validation state machine.
+  * Updates the running counts of valid and total UTF-8 code points.
+  */
+ __device__ inline void validate_utf8_byte(uint8_t byte, int& state,
+                                           uint32_t& valid_count, uint32_t& total_count) {
+     // UTF-8 state machine
+     // state: 0 = expecting start byte, 1-3 = expecting continuation bytes
+
+     if (state == 0) {
+         total_count++;
+         if ((byte & 0x80) == 0) {
+             // ASCII (0xxxxxxx)
+             valid_count++;
+         } else if ((byte & 0xE0) == 0xC0) {
+             // 2-byte sequence start (110xxxxx)
+             state = 1;
+         } else if ((byte & 0xF0) == 0xE0) {
+             // 3-byte sequence start (1110xxxx)
+             state = 2;
+         } else if ((byte & 0xF8) == 0xF0) {
+             // 4-byte sequence start (11110xxx)
+             state = 3;
+         }
+         // Invalid start byte - don't increment valid_count
+     } else {
+         // Expecting continuation byte (10xxxxxx)
+         if ((byte & 0xC0) == 0x80) {
+             state--;
+             if (state == 0) {
+                 valid_count++; // Complete valid sequence
+             }
+         } else {
+             // Invalid continuation - reset state
+             state = 0;
+             total_count++; // Count this as a new character
+         }
+     }
+ }
+
+ /**
+  * Calculate composite heuristic score.
+  */
+ __device__ inline float calculate_composite_score(
+     const DeviceHeuristicResult& result,
+     const DeviceHeuristicWeights& weights,
+     uint32_t length
+ ) {
+     // Normalize entropy to [0, 1] range (max entropy is 8.0)
+     float entropy_score = result.entropy / 8.0f;
+
+     // Entropy penalty for very high or very low values
+     // Ideal range is roughly 3.5-7.0 for most valid data
+     if (result.entropy < 0.5f || result.entropy > 7.8f) {
+         entropy_score *= 0.5f; // Penalize extreme values
+     }
+
+     // Printable ratio is already [0, 1]
+     float printable_score = result.printable_ratio;
+
+     // Control char penalty (invert - fewer is better)
+     float control_score = 1.0f - result.control_char_ratio;
+
+     // Null run penalty
+     float null_penalty = 1.0f;
+     if (length > 0) {
+         float null_ratio = static_cast<float>(result.max_null_run) / static_cast<float>(length);
+         null_penalty = 1.0f - fminf(null_ratio * 2.0f, 1.0f); // Penalize long null runs
+     }
+
+     // UTF-8 validity is already [0, 1]
+     float utf8_score = result.utf8_validity;
+
+     // Weighted combination
+     float composite =
+         weights.entropy_weight * entropy_score +
+         weights.printable_weight * printable_score +
+         weights.control_char_weight * control_score +
+         weights.null_run_weight * null_penalty +
+         weights.utf8_weight * utf8_score;
+
+     return fminf(fmaxf(composite, 0.0f), 1.0f);
+ }
+
+ } // namespace cuda
+ } // namespace etb
+
+ #endif // ETB_HEURISTICS_KERNEL_CUH
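
The scoring helpers declared above are self-contained math, so a host-side mirror is handy for sanity-checking kernel output. The sketch below is illustrative only: it re-implements the same -p*log2(p) entropy sum and the same clamped weighted combination with plain floats, because the DeviceHeuristicResult and DeviceHeuristicWeights field layouts live in cuda_common.cuh (not part of this diff) and the default weights are assumed equal here.

// Illustrative only: host-side mirror of entropy_contribution() and
// calculate_composite_score(), with assumed equal weights.
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <vector>

// Shannon entropy in bits per byte, same -p*log2(p) form as the device helper.
float shannon_entropy(const std::vector<uint8_t>& data) {
    if (data.empty()) return 0.0f;
    std::array<uint32_t, 256> hist{};
    for (uint8_t b : data) ++hist[b];
    float h = 0.0f;
    for (uint32_t count : hist) {
        if (count == 0) continue;
        float p = static_cast<float>(count) / static_cast<float>(data.size());
        h -= p * std::log2(p);
    }
    return h;  // 0.0 for constant data, up to 8.0 for uniformly random bytes
}

// Same shape as calculate_composite_score(): normalize each signal to [0, 1],
// penalize extreme entropy and long null runs, then take a clamped weighted sum.
float composite_score(float entropy, float printable_ratio, float control_ratio,
                      uint32_t max_null_run, uint32_t length, float utf8_validity) {
    float entropy_score = entropy / 8.0f;
    if (entropy < 0.5f || entropy > 7.8f) entropy_score *= 0.5f;

    float null_penalty = 1.0f;
    if (length > 0) {
        float null_ratio = static_cast<float>(max_null_run) / static_cast<float>(length);
        null_penalty = 1.0f - std::min(null_ratio * 2.0f, 1.0f);
    }

    // Weights assumed equal for illustration; the real defaults are not in this diff.
    const float w = 0.2f;
    float composite = w * entropy_score + w * printable_ratio +
                      w * (1.0f - control_ratio) + w * null_penalty + w * utf8_validity;
    return std::clamp(composite, 0.0f, 1.0f);
}

Note how the composite deliberately discounts near-maximal entropy: a buffer of uniformly random bytes scores close to 8.0 bits per byte, and the header halves its entropy contribution above 7.8 because fully random data is rarely a useful extraction candidate.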