explodethosebits-0.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etb/__init__.py +351 -0
- etb/__init__.pyi +976 -0
- etb/_etb.cpython-39-x86_64-linux-gnu.so +0 -0
- etb/_version.py +34 -0
- etb/py.typed +2 -0
- explodethosebits-0.3.0.dist-info/METADATA +405 -0
- explodethosebits-0.3.0.dist-info/RECORD +88 -0
- explodethosebits-0.3.0.dist-info/WHEEL +6 -0
- explodethosebits-0.3.0.dist-info/licenses/LICENSE +21 -0
- explodethosebits-0.3.0.dist-info/sboms/auditwheel.cdx.json +1 -0
- explodethosebits.libs/libcudart-c3a75b33.so.12.8.90 +0 -0
- include/etb/bit_coordinate.hpp +45 -0
- include/etb/bit_extraction.hpp +79 -0
- include/etb/bit_pruning.hpp +122 -0
- include/etb/config.hpp +284 -0
- include/etb/cuda/arch_optimizations.cuh +358 -0
- include/etb/cuda/blackwell_optimizations.cuh +300 -0
- include/etb/cuda/cuda_common.cuh +265 -0
- include/etb/cuda/etb_cuda.cuh +200 -0
- include/etb/cuda/gpu_memory.cuh +406 -0
- include/etb/cuda/heuristics_kernel.cuh +315 -0
- include/etb/cuda/path_generator_kernel.cuh +272 -0
- include/etb/cuda/prefix_pruner_kernel.cuh +370 -0
- include/etb/cuda/signature_kernel.cuh +328 -0
- include/etb/early_stopping.hpp +246 -0
- include/etb/etb.hpp +20 -0
- include/etb/heuristics.hpp +165 -0
- include/etb/memoization.hpp +285 -0
- include/etb/path.hpp +86 -0
- include/etb/path_count.hpp +87 -0
- include/etb/path_generator.hpp +175 -0
- include/etb/prefix_trie.hpp +339 -0
- include/etb/reporting.hpp +437 -0
- include/etb/scoring.hpp +269 -0
- include/etb/signature.hpp +190 -0
- include/gmock/gmock-actions.h +2297 -0
- include/gmock/gmock-cardinalities.h +159 -0
- include/gmock/gmock-function-mocker.h +518 -0
- include/gmock/gmock-matchers.h +5623 -0
- include/gmock/gmock-more-actions.h +658 -0
- include/gmock/gmock-more-matchers.h +120 -0
- include/gmock/gmock-nice-strict.h +277 -0
- include/gmock/gmock-spec-builders.h +2148 -0
- include/gmock/gmock.h +96 -0
- include/gmock/internal/custom/README.md +18 -0
- include/gmock/internal/custom/gmock-generated-actions.h +7 -0
- include/gmock/internal/custom/gmock-matchers.h +37 -0
- include/gmock/internal/custom/gmock-port.h +40 -0
- include/gmock/internal/gmock-internal-utils.h +487 -0
- include/gmock/internal/gmock-port.h +139 -0
- include/gmock/internal/gmock-pp.h +279 -0
- include/gtest/gtest-assertion-result.h +237 -0
- include/gtest/gtest-death-test.h +345 -0
- include/gtest/gtest-matchers.h +923 -0
- include/gtest/gtest-message.h +252 -0
- include/gtest/gtest-param-test.h +546 -0
- include/gtest/gtest-printers.h +1161 -0
- include/gtest/gtest-spi.h +250 -0
- include/gtest/gtest-test-part.h +192 -0
- include/gtest/gtest-typed-test.h +331 -0
- include/gtest/gtest.h +2321 -0
- include/gtest/gtest_pred_impl.h +279 -0
- include/gtest/gtest_prod.h +60 -0
- include/gtest/internal/custom/README.md +44 -0
- include/gtest/internal/custom/gtest-port.h +37 -0
- include/gtest/internal/custom/gtest-printers.h +42 -0
- include/gtest/internal/custom/gtest.h +37 -0
- include/gtest/internal/gtest-death-test-internal.h +307 -0
- include/gtest/internal/gtest-filepath.h +227 -0
- include/gtest/internal/gtest-internal.h +1560 -0
- include/gtest/internal/gtest-param-util.h +1026 -0
- include/gtest/internal/gtest-port-arch.h +122 -0
- include/gtest/internal/gtest-port.h +2481 -0
- include/gtest/internal/gtest-string.h +178 -0
- include/gtest/internal/gtest-type-util.h +220 -0
- lib/libetb_core.a +0 -0
- lib64/cmake/GTest/GTestConfig.cmake +33 -0
- lib64/cmake/GTest/GTestConfigVersion.cmake +43 -0
- lib64/cmake/GTest/GTestTargets-release.cmake +49 -0
- lib64/cmake/GTest/GTestTargets.cmake +139 -0
- lib64/libgmock.a +0 -0
- lib64/libgmock_main.a +0 -0
- lib64/libgtest.a +0 -0
- lib64/libgtest_main.a +0 -0
- lib64/pkgconfig/gmock.pc +10 -0
- lib64/pkgconfig/gmock_main.pc +10 -0
- lib64/pkgconfig/gtest.pc +9 -0
- lib64/pkgconfig/gtest_main.pc +10 -0
include/etb/cuda/gpu_memory.cuh
@@ -0,0 +1,406 @@
#ifndef ETB_GPU_MEMORY_CUH
#define ETB_GPU_MEMORY_CUH

#include "cuda_common.cuh"
#include <cstring>
#include <memory>
#include <vector>

namespace etb {
namespace cuda {

/**
 * RAII wrapper for pinned (page-locked) host memory.
 * Provides faster host-to-device transfers.
 */
template<typename T>
class PinnedBuffer {
public:
    PinnedBuffer() : data_(nullptr), size_(0), capacity_(0) {}

    explicit PinnedBuffer(size_t count) : data_(nullptr), size_(0), capacity_(0) {
        allocate(count);
    }

    ~PinnedBuffer() {
        free();
    }

    // Non-copyable
    PinnedBuffer(const PinnedBuffer&) = delete;
    PinnedBuffer& operator=(const PinnedBuffer&) = delete;

    // Movable
    PinnedBuffer(PinnedBuffer&& other) noexcept
        : data_(other.data_), size_(other.size_), capacity_(other.capacity_) {
        other.data_ = nullptr;
        other.size_ = 0;
        other.capacity_ = 0;
    }

    PinnedBuffer& operator=(PinnedBuffer&& other) noexcept {
        if (this != &other) {
            free();
            data_ = other.data_;
            size_ = other.size_;
            capacity_ = other.capacity_;
            other.data_ = nullptr;
            other.size_ = 0;
            other.capacity_ = 0;
        }
        return *this;
    }

    void allocate(size_t count) {
        if (count > capacity_) {
            free();
            ETB_CUDA_CHECK(cudaMallocHost(&data_, count * sizeof(T)));
            capacity_ = count;
        }
        size_ = count;
    }

    void free() {
        if (data_) {
            cudaFreeHost(data_);
            data_ = nullptr;
            size_ = 0;
            capacity_ = 0;
        }
    }

    T* data() { return data_; }
    const T* data() const { return data_; }
    size_t size() const { return size_; }
    size_t capacity() const { return capacity_; }
    size_t bytes() const { return size_ * sizeof(T); }
    bool empty() const { return size_ == 0; }

    T& operator[](size_t index) { return data_[index]; }
    const T& operator[](size_t index) const { return data_[index]; }

    // Copy from host vector
    void copy_from(const std::vector<T>& src) {
        allocate(src.size());
        std::memcpy(data_, src.data(), src.size() * sizeof(T));
    }

    // Copy to host vector
    std::vector<T> to_vector() const {
        return std::vector<T>(data_, data_ + size_);
    }

private:
    T* data_;
    size_t size_;
    size_t capacity_;
};

/**
 * RAII wrapper for device (GPU) memory.
 */
template<typename T>
class DeviceBuffer {
public:
    DeviceBuffer() : data_(nullptr), size_(0), capacity_(0) {}

    explicit DeviceBuffer(size_t count) : data_(nullptr), size_(0), capacity_(0) {
        allocate(count);
    }

    ~DeviceBuffer() {
        free();
    }

    // Non-copyable
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;

    // Movable
    DeviceBuffer(DeviceBuffer&& other) noexcept
        : data_(other.data_), size_(other.size_), capacity_(other.capacity_) {
        other.data_ = nullptr;
        other.size_ = 0;
        other.capacity_ = 0;
    }

    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {
        if (this != &other) {
            free();
            data_ = other.data_;
            size_ = other.size_;
            capacity_ = other.capacity_;
            other.data_ = nullptr;
            other.size_ = 0;
            other.capacity_ = 0;
        }
        return *this;
    }

    void allocate(size_t count) {
        if (count > capacity_) {
            free();
            ETB_CUDA_CHECK(cudaMalloc(&data_, count * sizeof(T)));
            capacity_ = count;
        }
        size_ = count;
    }

    void free() {
        if (data_) {
            cudaFree(data_);
            data_ = nullptr;
            size_ = 0;
            capacity_ = 0;
        }
    }

    void clear() {
        if (data_ && size_ > 0) {
            ETB_CUDA_CHECK(cudaMemset(data_, 0, size_ * sizeof(T)));
        }
    }

    T* data() { return data_; }
    const T* data() const { return data_; }
    size_t size() const { return size_; }
    size_t capacity() const { return capacity_; }
    size_t bytes() const { return size_ * sizeof(T); }
    bool empty() const { return size_ == 0; }

    // Copy from host
    void copy_from_host(const T* src, size_t count) {
        allocate(count);
        ETB_CUDA_CHECK(cudaMemcpy(data_, src, count * sizeof(T), cudaMemcpyHostToDevice));
    }

    void copy_from_host(const std::vector<T>& src) {
        copy_from_host(src.data(), src.size());
    }

    void copy_from_host(const PinnedBuffer<T>& src) {
        copy_from_host(src.data(), src.size());
    }

    // Async copy from pinned host memory
    void copy_from_host_async(const PinnedBuffer<T>& src, cudaStream_t stream) {
        allocate(src.size());
        ETB_CUDA_CHECK(cudaMemcpyAsync(data_, src.data(), src.bytes(),
                                       cudaMemcpyHostToDevice, stream));
    }

    // Copy to host
    void copy_to_host(T* dst, size_t count) const {
        ETB_CUDA_CHECK(cudaMemcpy(dst, data_, count * sizeof(T), cudaMemcpyDeviceToHost));
    }

    void copy_to_host(std::vector<T>& dst) const {
        dst.resize(size_);
        copy_to_host(dst.data(), size_);
    }

    void copy_to_host(PinnedBuffer<T>& dst) const {
        dst.allocate(size_);
        copy_to_host(dst.data(), size_);
    }

    // Async copy to pinned host memory
    void copy_to_host_async(PinnedBuffer<T>& dst, cudaStream_t stream) const {
        dst.allocate(size_);
        ETB_CUDA_CHECK(cudaMemcpyAsync(dst.data(), data_, bytes(),
                                       cudaMemcpyDeviceToHost, stream));
    }

private:
    T* data_;
    size_t size_;
    size_t capacity_;
};
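
The header itself doesn't show how the two wrappers compose, so here is a minimal usage sketch, assuming a live CUDA context; the stage_input helper is hypothetical and not part of the packaged header. Async host-to-device copies only overlap with other work when the host source is pinned, which is exactly what PinnedBuffer provides:

// Hypothetical helper (not in the package): stage bytes through pinned memory
// so the host-to-device copy can run asynchronously on a stream.
void stage_input(const std::vector<uint8_t>& host_bytes, cudaStream_t stream) {
    using namespace etb::cuda;

    PinnedBuffer<uint8_t> pinned;                 // page-locked host staging area
    pinned.copy_from(host_bytes);                 // plain memcpy into pinned memory

    DeviceBuffer<uint8_t> device;
    device.copy_from_host_async(pinned, stream);  // enqueued, returns immediately

    // ... launch kernels on `stream` that read device.data() ...

    ETB_CUDA_CHECK(cudaStreamSynchronize(stream)); // pinned must outlive the copy
}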

/**
 * GPU memory manager for the ETB library.
 * Handles allocation and management of all GPU memory resources.
 *
 * Requirements: 9.4, 9.5, 9.6
 */
class GPUMemoryManager {
public:
    /**
     * Configuration for GPU memory allocation.
     */
    struct Config {
        size_t max_input_size;           // Maximum input buffer size
        size_t prefix_trie_capacity;     // Number of trie nodes
        size_t candidate_queue_capacity; // Number of candidates to track
        size_t work_queue_capacity;      // Work items for path generation
        int num_streams;                 // Number of CUDA streams

        Config()
            : max_input_size(1024 * 1024) // 1MB default
            , prefix_trie_capacity(65536)
            , candidate_queue_capacity(1024)
            , work_queue_capacity(65536)
            , num_streams(4) {}
    };

    GPUMemoryManager();
    explicit GPUMemoryManager(const Config& config);
    ~GPUMemoryManager();

    // Non-copyable, non-movable
    GPUMemoryManager(const GPUMemoryManager&) = delete;
    GPUMemoryManager& operator=(const GPUMemoryManager&) = delete;

    /**
     * Initialize GPU memory with the given configuration.
     * @param config Memory configuration
     * @return true if initialization succeeded
     */
    bool initialize(const Config& config);

    /**
     * Check if the manager is initialized.
     */
    bool is_initialized() const { return initialized_; }

    /**
     * Release all GPU memory.
     */
    void release();

    // Input buffer management (pinned host + device)
    PinnedBuffer<uint8_t>& get_pinned_input() { return pinned_input_; }
    DeviceBuffer<uint8_t>& get_device_input() { return device_input_; }

    /**
     * Upload input data to GPU.
     * @param data Input byte data
     * @param length Length of input
     * @param stream CUDA stream for async transfer (nullptr for sync)
     */
    void upload_input(const uint8_t* data, size_t length, cudaStream_t stream = nullptr);

    // Prefix trie (shared memory compatible)
    DeviceBuffer<DevicePrefixTrieNode>& get_prefix_trie() { return prefix_trie_; }

    /**
     * Initialize prefix trie on device.
     * @param initial_nodes Initial nodes to upload (optional)
     */
    void init_prefix_trie(const std::vector<DevicePrefixTrieNode>* initial_nodes = nullptr);

    // Candidate queue (global memory)
    DeviceBuffer<DeviceCandidate>& get_candidate_queue() { return candidate_queue_; }
    DeviceBuffer<uint32_t>& get_candidate_count() { return candidate_count_; }
    DeviceBuffer<float>& get_min_score() { return min_score_; }

    /**
     * Reset candidate queue for new extraction.
     */
    void reset_candidate_queue();

    /**
     * Download candidates from GPU.
     * @param candidates Output vector for candidates
     * @return Number of candidates downloaded
     */
    size_t download_candidates(std::vector<DeviceCandidate>& candidates);

    // Work queue for path generation
    DeviceBuffer<uint32_t>& get_work_queue() { return work_queue_; }
    DeviceBuffer<uint32_t>& get_work_queue_head() { return work_queue_head_; }
    DeviceBuffer<uint32_t>& get_work_queue_tail() { return work_queue_tail_; }

    // Histogram buffer for entropy calculation
    DeviceBuffer<uint32_t>& get_histogram() { return histogram_; }

    // CUDA streams
    cudaStream_t get_stream(int index) const;
    int num_streams() const { return static_cast<int>(streams_.size()); }

    /**
     * Synchronize all streams.
     */
    void synchronize_all();

    /**
     * Get memory usage statistics.
     */
    struct MemoryStats {
        size_t total_allocated;
        size_t input_buffer_size;
        size_t trie_size;
        size_t candidate_queue_size;
        size_t work_queue_size;
        size_t other_size;
    };
    MemoryStats get_memory_stats() const;

    /**
     * Get the current configuration.
     */
    const Config& get_config() const { return config_; }

private:
    Config config_;
    bool initialized_;

    // Pinned host memory for fast transfers
    PinnedBuffer<uint8_t> pinned_input_;

    // Device memory buffers
    DeviceBuffer<uint8_t> device_input_;
    DeviceBuffer<DevicePrefixTrieNode> prefix_trie_;
    DeviceBuffer<DeviceCandidate> candidate_queue_;
    DeviceBuffer<uint32_t> candidate_count_;
    DeviceBuffer<float> min_score_;
    DeviceBuffer<uint32_t> work_queue_;
    DeviceBuffer<uint32_t> work_queue_head_;
    DeviceBuffer<uint32_t> work_queue_tail_;
    DeviceBuffer<uint32_t> histogram_;

    // CUDA streams
    std::vector<cudaStream_t> streams_;

    void allocate_buffers();
    void create_streams();
    void destroy_streams();
};
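
The manager's definitions live in the compiled library rather than in this header, so the following lifecycle sketch is an assumption pieced together from the declarations above; run_extraction is a hypothetical caller:

// Hypothetical driver (not in the package): configure, upload, run, download.
bool run_extraction(const uint8_t* bytes, size_t len) {
    using namespace etb::cuda;

    GPUMemoryManager::Config cfg;      // defaults: 1MB input, 65536 trie nodes,
    cfg.num_streams = 2;               // 1024 candidates, 4 streams

    GPUMemoryManager mgr;
    if (!mgr.initialize(cfg)) return false;

    mgr.upload_input(bytes, len, mgr.get_stream(0)); // async H2D on stream 0
    mgr.init_prefix_trie();                          // start from an empty trie
    mgr.reset_candidate_queue();

    // ... launch kernels against mgr.get_device_input(), the trie, etc. ...

    mgr.synchronize_all();
    std::vector<DeviceCandidate> found;
    size_t n = mgr.download_candidates(found);
    return n > 0;                      // mgr releases all GPU memory on scope exit
}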

/**
 * Constant memory manager for signature dictionary.
 * Signatures are stored in constant memory for fast broadcast reads.
 *
 * Requirements: 9.4
 */
class SignatureConstantMemory {
public:
    /**
     * Upload signatures to constant memory.
     * @param signatures Vector of device signatures
     * @return true if upload succeeded
     */
    static bool upload_signatures(const std::vector<DeviceFileSignature>& signatures);

    /**
     * Upload footer signatures to constant memory.
     * @param footers Vector of footer signatures
     * @return true if upload succeeded
     */
    static bool upload_footers(const std::vector<DeviceFooterSignature>& footers);

    /**
     * Get the number of signatures in constant memory.
     */
    static uint32_t get_signature_count();

    /**
     * Clear constant memory signatures.
     */
    static void clear();
};
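
Constant memory suits a signature dictionary because all threads of a warp reading the same signature entry get a single broadcast from the constant cache. The header hides the storage, so the sketch below of what upload_signatures plausibly does internally is an assumption; g_signatures, g_signature_count, and ETB_MAX_SIGNATURES are hypothetical names, and the fixed size of __constant__ arrays is presumably why the upload can fail:

// Assumed internals (illustrative only; the real definition ships compiled).
#define ETB_MAX_SIGNATURES 256  // hypothetical capacity

__constant__ DeviceFileSignature g_signatures[ETB_MAX_SIGNATURES];  // hypothetical
__constant__ uint32_t g_signature_count;                            // hypothetical

bool upload_signatures_sketch(const std::vector<DeviceFileSignature>& sigs) {
    if (sigs.size() > ETB_MAX_SIGNATURES) return false;  // dictionary too large
    uint32_t n = static_cast<uint32_t>(sigs.size());
    return cudaMemcpyToSymbol(g_signatures, sigs.data(),
                              n * sizeof(DeviceFileSignature)) == cudaSuccess
        && cudaMemcpyToSymbol(g_signature_count, &n, sizeof(n)) == cudaSuccess;
}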

} // namespace cuda
} // namespace etb

#endif // ETB_GPU_MEMORY_CUH
include/etb/cuda/heuristics_kernel.cuh
@@ -0,0 +1,315 @@
#ifndef ETB_HEURISTICS_KERNEL_CUH
#define ETB_HEURISTICS_KERNEL_CUH

#include "cuda_common.cuh"
#include "gpu_memory.cuh"

namespace etb {
namespace cuda {

/**
 * Shared memory structure for heuristics calculation.
 * Uses a shared-memory histogram for efficient entropy calculation.
 * All members are 4-byte scalars, so the struct occupies
 * 256*4 + 256*4 + 6*4 = 2072 bytes of shared memory per block.
 */
struct HeuristicsSharedMem {
    // Histogram for byte frequency (256 bins)
    uint32_t histogram[256];

    // Reduction scratch space
    float reduction_scratch[256];

    // Partial results
    uint32_t printable_count;
    uint32_t control_count;
    uint32_t null_run_max;
    uint32_t current_null_run;
    uint32_t utf8_valid_count;
    uint32_t utf8_total_count;
};

/**
 * OPTIMIZED heuristics CUDA kernel.
 *
 * Calculates Shannon entropy, byte distribution, and other heuristics
 * using warp shuffle reductions and parallel UTF-8 validation.
 *
 * Optimizations applied:
 * - Warp shuffle reductions instead of shared memory atomics for entropy
 * - Parallel UTF-8 validation using a chunked heuristic approach
 * - Better memory coalescing for histogram building
 * - Reduced thread divergence in character classification
 *
 * Requirements: 9.3
 *
 * @param data Input byte data
 * @param length Length of data
 * @param weights Heuristic weights
 * @param result Output heuristic result
 */
__global__ void heuristics_kernel(
    const uint8_t* data,
    uint32_t length,
    DeviceHeuristicWeights weights,
    DeviceHeuristicResult* result
);

/**
 * Batch heuristics kernel for evaluating multiple byte sequences.
 *
 * @param data_ptrs Array of pointers to byte sequences
 * @param lengths Array of sequence lengths
 * @param num_sequences Number of sequences to evaluate
 * @param weights Heuristic weights
 * @param results Output array of heuristic results
 */
__global__ void batch_heuristics_kernel(
    const uint8_t** data_ptrs,
    const uint32_t* lengths,
    uint32_t num_sequences,
    DeviceHeuristicWeights weights,
    DeviceHeuristicResult* results
);

/**
 * Inline heuristics evaluation for use within other kernels.
 * Uses warp-level operations for efficiency.
 *
 * @param data Byte data (in registers or shared memory)
 * @param length Length of data (max 32 for inline evaluation)
 * @param weights Heuristic weights
 * @return Heuristic result
 */
__device__ DeviceHeuristicResult evaluate_heuristics_inline(
    const uint8_t* data,
    uint32_t length,
    const DeviceHeuristicWeights& weights
);

/**
 * Host-side launcher for heuristics kernel.
 */
class HeuristicsKernel {
public:
    HeuristicsKernel();
    ~HeuristicsKernel();

    /**
     * Configure the kernel for a specific device.
     * @param device_id CUDA device ID
     */
    void configure(int device_id);

    /**
     * Evaluate heuristics for a single byte sequence.
     * @param data Device pointer to byte data
     * @param length Length of data
     * @param weights Heuristic weights
     * @param result Device pointer to result
     * @param stream CUDA stream
     */
    void evaluate(const uint8_t* data, uint32_t length,
                  const DeviceHeuristicWeights& weights,
                  DeviceHeuristicResult* result,
                  cudaStream_t stream = nullptr);

    /**
     * Evaluate heuristics for multiple byte sequences.
     * @param data_ptrs Device array of pointers to byte sequences
     * @param lengths Device array of sequence lengths
     * @param num_sequences Number of sequences
     * @param weights Heuristic weights
     * @param results Device array of results
     * @param stream CUDA stream
     */
    void evaluate_batch(const uint8_t** data_ptrs, const uint32_t* lengths,
                        uint32_t num_sequences,
                        const DeviceHeuristicWeights& weights,
                        DeviceHeuristicResult* results,
                        cudaStream_t stream = nullptr);

    /**
     * Get the kernel configuration.
     */
    const KernelConfig& get_config() const { return kernel_config_; }

private:
    KernelConfig kernel_config_;
    bool configured_;
};
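
A minimal sketch of driving the launcher against a device-resident sequence; score_sequence is hypothetical, the zeroed weights are placeholders, and the composite_score field name is a guess, since DeviceHeuristicResult is defined in cuda_common.cuh, whose contents are not shown in this diff:

// Hypothetical caller (not in the package): score one sequence on a stream.
float score_sequence(const DeviceBuffer<uint8_t>& seq, cudaStream_t stream) {
    using namespace etb::cuda;

    HeuristicsKernel kernel;
    kernel.configure(0);                      // pick launch geometry for device 0

    DeviceHeuristicWeights weights{};         // zero-init; set real weights here
    DeviceBuffer<DeviceHeuristicResult> result(1);

    kernel.evaluate(seq.data(), static_cast<uint32_t>(seq.size()),
                    weights, result.data(), stream);

    std::vector<DeviceHeuristicResult> host;
    result.copy_to_host(host);                // blocking D2H copy syncs the work
    return host[0].composite_score;           // field name is an assumption
}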

// ============================================================================
// Device Functions
// ============================================================================

/**
 * Check if a byte is printable ASCII (0x20-0x7E).
 */
__device__ inline bool is_printable_ascii(uint8_t byte) {
    return byte >= 0x20 && byte <= 0x7E;
}

/**
 * Check if a byte is a control character (0x00-0x1F, excluding 0x09, 0x0A, 0x0D).
 */
__device__ inline bool is_control_char(uint8_t byte) {
    if (byte > 0x1F) return false;
    if (byte == 0x09 || byte == 0x0A || byte == 0x0D) return false; // Tab, LF, CR
    return true;
}

/**
 * Calculate entropy contribution for a single bin.
 * Returns -p * log2(p) where p = count / total.
 */
__device__ inline float entropy_contribution(uint32_t count, uint32_t total) {
    if (count == 0 || total == 0) return 0.0f;
    float p = static_cast<float>(count) / static_cast<float>(total);
    return -p * log2f(p);
}
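
Summed over all 256 bins, entropy_contribution yields the Shannon entropy H = -sum_i p_i * log2(p_i), which for byte data peaks at 8 bits. A host-side reference for cross-checking kernel output (illustrative, not part of the package):

// Host reference: same formula the device helpers compute bin by bin.
#include <cmath>
#include <cstdint>
#include <vector>

double shannon_entropy(const std::vector<uint8_t>& bytes) {
    uint32_t hist[256] = {0};
    for (uint8_t b : bytes) hist[b]++;
    double h = 0.0;
    for (uint32_t c : hist) {
        if (c == 0) continue;
        double p = static_cast<double>(c) / bytes.size();
        h -= p * std::log2(p);
    }
    return h;  // 0.0 for constant data, 8.0 for uniformly random bytes
}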

/**
 * Warp-level histogram update using atomics.
 */
__device__ inline void warp_histogram_add(uint32_t* histogram, uint8_t value) {
    atomicAdd(&histogram[value], 1);
}

/**
 * Warp-level reduction for summing histogram entropy.
 * Note: the scratch parameter is unused here; it mirrors the signature of
 * block_reduce_entropy below.
 */
__device__ inline float warp_reduce_entropy(float* scratch, uint32_t* histogram,
                                            uint32_t total, int lane_id) {
    // Each lane handles 8 histogram bins (256 / 32 = 8)
    float local_entropy = 0.0f;
    for (int i = 0; i < 8; ++i) {
        int bin = lane_id * 8 + i;
        local_entropy += entropy_contribution(histogram[bin], total);
    }

    // Warp reduction
    for (int offset = 16; offset > 0; offset /= 2) {
        local_entropy += __shfl_down_sync(0xFFFFFFFF, local_entropy, offset);
    }

    return __shfl_sync(0xFFFFFFFF, local_entropy, 0); // broadcast lane 0's sum
}

/**
 * Block-level histogram reduction.
 * Note: the tree reduction below assumes block_size is a power of two.
 */
__device__ inline float block_reduce_entropy(float* scratch, uint32_t* histogram,
                                             uint32_t total, int tid, int block_size) {
    // First, each thread handles some bins
    int bins_per_thread = (256 + block_size - 1) / block_size;
    float local_entropy = 0.0f;

    for (int i = 0; i < bins_per_thread; ++i) {
        int bin = tid * bins_per_thread + i;
        if (bin < 256) {
            local_entropy += entropy_contribution(histogram[bin], total);
        }
    }

    // Store to shared memory
    scratch[tid] = local_entropy;
    __syncthreads();

    // Tree reduction
    for (int stride = block_size / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            scratch[tid] += scratch[tid + stride];
        }
        __syncthreads();
    }

    return scratch[0];
}

/**
 * Advance a per-byte UTF-8 state machine, updating the running counts of
 * valid and total code points.
 */
__device__ inline void validate_utf8_byte(uint8_t byte, int& state,
                                          uint32_t& valid_count, uint32_t& total_count) {
    // UTF-8 state machine
    // state: 0 = expecting start byte, 1-3 = expecting continuation bytes

    if (state == 0) {
        total_count++;
        if ((byte & 0x80) == 0) {
            // ASCII (0xxxxxxx)
            valid_count++;
        } else if ((byte & 0xE0) == 0xC0) {
            // 2-byte sequence start (110xxxxx)
            state = 1;
        } else if ((byte & 0xF0) == 0xE0) {
            // 3-byte sequence start (1110xxxx)
            state = 2;
        } else if ((byte & 0xF8) == 0xF0) {
            // 4-byte sequence start (11110xxx)
            state = 3;
        }
        // Invalid start byte - don't increment valid_count
    } else {
        // Expecting continuation byte (10xxxxxx)
        if ((byte & 0xC0) == 0x80) {
            state--;
            if (state == 0) {
                valid_count++; // Complete valid sequence
            }
        } else {
            // Invalid continuation - reset state
            state = 0;
            total_count++; // Count this as a new character
        }
    }
}
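
The state machine above is a structural check only: it validates lead and continuation byte patterns but does not reject overlong encodings or surrogate code points, which is adequate for a scoring heuristic. A host-side mirror (illustrative, not part of the package) showing how the per-byte logic accumulates a validity ratio:

// Host mirror of validate_utf8_byte, folded into a loop over a buffer.
#include <cstdint>
#include <cstdio>

void utf8_ratio(const uint8_t* data, size_t len) {
    int state = 0;
    uint32_t valid = 0, total = 0;
    for (size_t i = 0; i < len; ++i) {
        uint8_t byte = data[i];
        if (state == 0) {
            total++;
            if ((byte & 0x80) == 0) valid++;            // ASCII
            else if ((byte & 0xE0) == 0xC0) state = 1;  // 2-byte lead
            else if ((byte & 0xF0) == 0xE0) state = 2;  // 3-byte lead
            else if ((byte & 0xF8) == 0xF0) state = 3;  // 4-byte lead
        } else if ((byte & 0xC0) == 0x80) {
            if (--state == 0) valid++;                  // sequence completed
        } else {
            state = 0;                                  // broken sequence
            total++;
        }
    }
    printf("utf8 validity: %u / %u\n", valid, total);
}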

/**
 * Calculate composite heuristic score.
 */
__device__ inline float calculate_composite_score(
    const DeviceHeuristicResult& result,
    const DeviceHeuristicWeights& weights,
    uint32_t length
) {
    // Normalize entropy to [0, 1] range (max entropy is 8.0)
    float entropy_score = result.entropy / 8.0f;

    // Entropy penalty for very high or very low values
    // Ideal range is roughly 3.5-7.0 for most valid data
    if (result.entropy < 0.5f || result.entropy > 7.8f) {
        entropy_score *= 0.5f; // Penalize extreme values
    }

    // Printable ratio is already [0, 1]
    float printable_score = result.printable_ratio;

    // Control char penalty (invert - fewer is better)
    float control_score = 1.0f - result.control_char_ratio;

    // Null run penalty
    float null_penalty = 1.0f;
    if (length > 0) {
        float null_ratio = static_cast<float>(result.max_null_run) / static_cast<float>(length);
        null_penalty = 1.0f - fminf(null_ratio * 2.0f, 1.0f); // Penalize long null runs
    }

    // UTF-8 validity is already [0, 1]
    float utf8_score = result.utf8_validity;

    // Weighted combination
    float composite =
        weights.entropy_weight * entropy_score +
        weights.printable_weight * printable_score +
        weights.control_char_weight * control_score +
        weights.null_run_weight * null_penalty +
        weights.utf8_weight * utf8_score;

    return fminf(fmaxf(composite, 0.0f), 1.0f);
}
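
As a worked example with assumed inputs: for a 256-byte window and equal weights of 0.2, entropy 4.5 gives entropy_score 0.5625 (no extreme-value penalty), printable_ratio 0.9 passes through, control_char_ratio 0.05 gives control_score 0.95, max_null_run 8 gives null_ratio 0.03125 and null_penalty 0.9375, and utf8_validity 0.85 passes through; the composite is 0.2 * (0.5625 + 0.9 + 0.95 + 0.9375 + 0.85) = 0.84, comfortably inside the final [0, 1] clamp.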

} // namespace cuda
} // namespace etb

#endif // ETB_HEURISTICS_KERNEL_CUH