cactus-react-native 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. package/README.md +212 -27
  2. package/android/src/main/jniLibs/arm64-v8a/libcactus.a +0 -0
  3. package/cpp/HybridCactus.cpp +119 -0
  4. package/cpp/HybridCactus.hpp +13 -0
  5. package/cpp/cactus_ffi.h +24 -0
  6. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h +24 -0
  7. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h +41 -1
  8. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h +66 -48
  9. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/gemma_tools.h +549 -0
  10. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h +102 -21
  11. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h +45 -195
  12. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h +399 -140
  13. package/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus +0 -0
  14. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h +24 -0
  15. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h +41 -1
  16. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h +66 -48
  17. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/gemma_tools.h +549 -0
  18. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h +102 -21
  19. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h +45 -195
  20. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h +399 -140
  21. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus +0 -0
  22. package/lib/module/api/Database.js +0 -92
  23. package/lib/module/api/Database.js.map +1 -1
  24. package/lib/module/classes/CactusLM.js +33 -15
  25. package/lib/module/classes/CactusLM.js.map +1 -1
  26. package/lib/module/classes/CactusSTT.js +90 -15
  27. package/lib/module/classes/CactusSTT.js.map +1 -1
  28. package/lib/module/hooks/useCactusLM.js +14 -5
  29. package/lib/module/hooks/useCactusLM.js.map +1 -1
  30. package/lib/module/hooks/useCactusSTT.js +100 -4
  31. package/lib/module/hooks/useCactusSTT.js.map +1 -1
  32. package/lib/module/index.js.map +1 -1
  33. package/lib/module/models.js +336 -0
  34. package/lib/module/models.js.map +1 -0
  35. package/lib/module/native/Cactus.js +37 -0
  36. package/lib/module/native/Cactus.js.map +1 -1
  37. package/lib/module/types/CactusLM.js +2 -0
  38. package/lib/module/types/CactusSTT.js +2 -0
  39. package/lib/module/types/common.js +2 -0
  40. package/lib/module/types/{CactusModel.js.map → common.js.map} +1 -1
  41. package/lib/typescript/src/api/Database.d.ts +0 -6
  42. package/lib/typescript/src/api/Database.d.ts.map +1 -1
  43. package/lib/typescript/src/classes/CactusLM.d.ts +7 -3
  44. package/lib/typescript/src/classes/CactusLM.d.ts.map +1 -1
  45. package/lib/typescript/src/classes/CactusSTT.d.ts +13 -4
  46. package/lib/typescript/src/classes/CactusSTT.d.ts.map +1 -1
  47. package/lib/typescript/src/hooks/useCactusLM.d.ts +2 -2
  48. package/lib/typescript/src/hooks/useCactusLM.d.ts.map +1 -1
  49. package/lib/typescript/src/hooks/useCactusSTT.d.ts +12 -4
  50. package/lib/typescript/src/hooks/useCactusSTT.d.ts.map +1 -1
  51. package/lib/typescript/src/index.d.ts +2 -3
  52. package/lib/typescript/src/index.d.ts.map +1 -1
  53. package/lib/typescript/src/models.d.ts +6 -0
  54. package/lib/typescript/src/models.d.ts.map +1 -0
  55. package/lib/typescript/src/native/Cactus.d.ts +6 -1
  56. package/lib/typescript/src/native/Cactus.d.ts.map +1 -1
  57. package/lib/typescript/src/specs/Cactus.nitro.d.ts +5 -0
  58. package/lib/typescript/src/specs/Cactus.nitro.d.ts.map +1 -1
  59. package/lib/typescript/src/types/CactusLM.d.ts +2 -0
  60. package/lib/typescript/src/types/CactusLM.d.ts.map +1 -1
  61. package/lib/typescript/src/types/CactusSTT.d.ts +20 -0
  62. package/lib/typescript/src/types/CactusSTT.d.ts.map +1 -1
  63. package/lib/typescript/src/types/common.d.ts +28 -0
  64. package/lib/typescript/src/types/common.d.ts.map +1 -0
  65. package/nitrogen/generated/shared/c++/HybridCactusSpec.cpp +5 -0
  66. package/nitrogen/generated/shared/c++/HybridCactusSpec.hpp +5 -0
  67. package/package.json +1 -1
  68. package/src/api/Database.ts +0 -133
  69. package/src/classes/CactusLM.ts +49 -17
  70. package/src/classes/CactusSTT.ts +118 -17
  71. package/src/hooks/useCactusLM.ts +25 -5
  72. package/src/hooks/useCactusSTT.ts +117 -5
  73. package/src/index.tsx +6 -2
  74. package/src/models.ts +344 -0
  75. package/src/native/Cactus.ts +55 -0
  76. package/src/specs/Cactus.nitro.ts +5 -0
  77. package/src/types/CactusLM.ts +3 -0
  78. package/src/types/CactusSTT.ts +26 -0
  79. package/src/types/common.ts +28 -0
  80. package/lib/module/types/CactusModel.js +0 -2
  81. package/lib/module/types/CactusSTTModel.js +0 -2
  82. package/lib/module/types/CactusSTTModel.js.map +0 -1
  83. package/lib/typescript/src/types/CactusModel.d.ts +0 -13
  84. package/lib/typescript/src/types/CactusModel.d.ts.map +0 -1
  85. package/lib/typescript/src/types/CactusSTTModel.d.ts +0 -8
  86. package/lib/typescript/src/types/CactusSTTModel.d.ts.map +0 -1
  87. package/src/types/CactusModel.ts +0 -15
  88. package/src/types/CactusSTTModel.ts +0 -10
@@ -2,6 +2,13 @@
2
2
  #define KERNEL_UTILS_H
3
3
 
4
4
  #include <arm_neon.h>
5
+ #if defined(__APPLE__)
6
+ #include <TargetConditionals.h>
7
+ #endif
8
+ #if defined(__ANDROID__)
9
+ #include <sys/auxv.h>
10
+ #include <asm/hwcap.h>
11
+ #endif
5
12
  #include <algorithm>
6
13
  #include <cmath>
7
14
  #include <thread>
@@ -19,166 +26,439 @@
19
26
  #include <cstdio>
20
27
 
21
28
  constexpr size_t NEON_VECTOR_SIZE = 16;
29
+ constexpr size_t STREAMING_STORE_THRESHOLD = 32768;
22
30
 
23
- inline int8_t clamp_to_int8(float value) {
24
- int32_t clamped = static_cast<int32_t>(roundf(value));
25
- return static_cast<int8_t>(std::max(-128, std::min(127, clamped)));
26
- }
27
-
28
- inline int8_t clamp_to_int8(int32_t value) {
29
- return static_cast<int8_t>(std::max(-128, std::min(127, value)));
31
+ inline void stream_store_f16x8(__fp16* dst, float16x8_t val) {
32
+ #if defined(__aarch64__)
33
+ float16x4_t lo = vget_low_f16(val);
34
+ float16x4_t hi = vget_high_f16(val);
35
+ __asm__ __volatile__(
36
+ "stnp %d0, %d1, [%2]"
37
+ :
38
+ : "w"(lo), "w"(hi), "r"(dst)
39
+ : "memory"
40
+ );
41
+ #else
42
+ vst1q_f16(dst, val);
43
+ #endif
30
44
  }
31
45
 
32
46
  #if defined(__ARM_FEATURE_DOTPROD)
33
- inline int32x4_t accum_i8mm(int32x4_t acc, int8x16_t a, int8x16_t b) {
47
+ inline int32x4_t accum_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
34
48
  return vdotq_s32(acc, a, b);
35
49
  }
36
50
  #else
37
- inline int32x4_t accum_i8mm(int32x4_t acc, int8x16_t a, int8x16_t b) {
51
+ inline int32x4_t accum_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
38
52
  int16x8_t prod_low = vmull_s8(vget_low_s8(a), vget_low_s8(b));
39
53
  int32x4_t acc_high = vpaddlq_s16(vmull_s8(vget_high_s8(a), vget_high_s8(b)));
40
54
  return vaddq_s32(vaddq_s32(acc, vpaddlq_s16(prod_low)), acc_high);
41
55
  }
42
56
  #endif
43
57
 
44
- inline float16x8_t accum_f16_dot(float16x8_t acc, float16x8_t a_low, float16x8_t a_high,
58
+ // I8MM support: runtime detection on Android, compile-time on Apple
59
+ #if defined(__ANDROID__) && defined(__aarch64__)
60
+
61
+ inline bool cactus_has_i8mm() {
62
+ static int8_t supported = -1;
63
+ if (supported == -1) {
64
+ unsigned long hwcaps = getauxval(AT_HWCAP2);
65
+ supported = (hwcaps & HWCAP2_I8MM) ? 1 : 0;
66
+ }
67
+ return supported;
68
+ }
69
+
70
+ __attribute__((target("arch=armv8.2-a+i8mm")))
71
+ inline int32x4_t accum_matmul(int32x4_t acc, int8x16_t a, int8x16_t b) {
72
+ return vmmlaq_s32(acc, a, b);
73
+ }
74
+
75
+ #elif defined(__APPLE__) && defined(__aarch64__)
76
+
77
+ inline bool cactus_has_i8mm() {
78
+ return true;
79
+ }
80
+
81
+ __attribute__((target("i8mm")))
82
+ inline int32x4_t accum_matmul(int32x4_t acc, int8x16_t a, int8x16_t b) {
83
+ return vmmlaq_s32(acc, a, b);
84
+ }
85
+
86
+ #else
87
+
88
+ inline bool cactus_has_i8mm() {
89
+ return false;
90
+ }
91
+
92
+ #endif
93
+
94
+ inline float16x8_t accum_f16_dot(float16x8_t acc, float16x8_t a_low, float16x8_t a_high,
45
95
  float16x8_t b_low, float16x8_t b_high) {
46
96
  acc = vfmaq_f16(acc, a_low, b_low);
47
97
  return vfmaq_f16(acc, a_high, b_high);
48
98
  }
49
99
 
50
- inline float32x4_t accum_f32_dot(float32x4_t acc, float32x4_t a_low, float32x4_t a_high,
51
- float32x4_t b_low, float32x4_t b_high) {
52
- acc = vfmaq_f32(acc, a_low, b_low);
53
- return vfmaq_f32(acc, a_high, b_high);
100
+ inline float32x4_t fast_exp_f32x4(float32x4_t x) {
101
+ const float32x4_t log2e = vdupq_n_f32(1.4426950408889634f);
102
+ const float32x4_t ln2 = vdupq_n_f32(0.6931471805599453f);
103
+
104
+ const float32x4_t c0 = vdupq_n_f32(1.0f);
105
+ const float32x4_t c1 = vdupq_n_f32(0.6931471805599453f);
106
+ const float32x4_t c2 = vdupq_n_f32(0.2402265069591007f);
107
+ const float32x4_t c3 = vdupq_n_f32(0.05550410866482158f);
108
+ const float32x4_t c4 = vdupq_n_f32(0.009618129842071803f);
109
+
110
+ x = vmaxq_f32(x, vdupq_n_f32(-87.0f));
111
+ x = vminq_f32(x, vdupq_n_f32(87.0f));
112
+
113
+ float32x4_t z = vmulq_f32(x, log2e);
114
+
115
+ int32x4_t zi = vcvtq_s32_f32(z);
116
+ float32x4_t zf = vsubq_f32(z, vcvtq_f32_s32(zi));
117
+
118
+ uint32x4_t neg_mask = vcltq_f32(zf, vdupq_n_f32(0.0f));
119
+ zi = vsubq_s32(zi, vandq_s32(vreinterpretq_s32_u32(neg_mask), vdupq_n_s32(1)));
120
+ zf = vaddq_f32(zf, vreinterpretq_f32_u32(vandq_u32(neg_mask, vreinterpretq_u32_f32(vdupq_n_f32(1.0f)))));
121
+
122
+ float32x4_t zf_ln2 = vmulq_f32(zf, ln2);
123
+ float32x4_t p = c4;
124
+ p = vfmaq_f32(c3, p, zf_ln2);
125
+ p = vfmaq_f32(c2, p, zf_ln2);
126
+ p = vfmaq_f32(c1, p, zf_ln2);
127
+ p = vfmaq_f32(c0, p, zf_ln2);
128
+
129
+ int32x4_t exp_bits = vshlq_n_s32(vaddq_s32(zi, vdupq_n_s32(127)), 23);
130
+ float32x4_t scale = vreinterpretq_f32_s32(exp_bits);
131
+
132
+ return vmulq_f32(p, scale);
133
+ }
134
+
135
+ inline float32x4_t fast_tanh_f32x4(float32x4_t x) {
136
+ const float32x4_t one = vdupq_n_f32(1.0f);
137
+ const float32x4_t neg_one = vdupq_n_f32(-1.0f);
138
+
139
+ uint32x4_t pos_sat = vcgtq_f32(x, vdupq_n_f32(4.5f));
140
+ uint32x4_t neg_sat = vcltq_f32(x, vdupq_n_f32(-4.5f));
141
+
142
+ const float32x4_t c27 = vdupq_n_f32(27.0f);
143
+ const float32x4_t c9 = vdupq_n_f32(9.0f);
144
+
145
+ float32x4_t x2 = vmulq_f32(x, x);
146
+ float32x4_t num = vaddq_f32(c27, x2);
147
+ float32x4_t den = vfmaq_f32(c27, c9, x2);
148
+
149
+ float32x4_t result = vmulq_f32(x, vdivq_f32(num, den));
150
+
151
+ result = vbslq_f32(pos_sat, one, result);
152
+ result = vbslq_f32(neg_sat, neg_one, result);
153
+
154
+ return result;
155
+ }
156
+
157
+ inline int8x16_t unpack_int4_lo(uint8x16_t packed) {
158
+ uint8x16_t lo = vandq_u8(packed, vdupq_n_u8(0x0F));
159
+ uint8x16_t sign_mask = vcgtq_u8(lo, vdupq_n_u8(7));
160
+ uint8x16_t correction = vandq_u8(sign_mask, vdupq_n_u8(16));
161
+ return vreinterpretq_s8_u8(vsubq_u8(lo, correction));
162
+ }
163
+
164
+ inline int8x16_t unpack_int4_hi(uint8x16_t packed) {
165
+ uint8x16_t hi = vshrq_n_u8(packed, 4);
166
+ uint8x16_t sign_mask = vcgtq_u8(hi, vdupq_n_u8(7));
167
+ uint8x16_t correction = vandq_u8(sign_mask, vdupq_n_u8(16));
168
+ return vreinterpretq_s8_u8(vsubq_u8(hi, correction));
169
+ }
170
+
171
+ inline void unpack_int4_to_int8x32(uint8x16_t packed, int8x16_t& out_lo, int8x16_t& out_hi) {
172
+ int8x16_t lo_nibbles = unpack_int4_lo(packed);
173
+ int8x16_t hi_nibbles = unpack_int4_hi(packed);
174
+ int8x16x2_t interleaved = vzipq_s8(lo_nibbles, hi_nibbles);
175
+ out_lo = interleaved.val[0];
176
+ out_hi = interleaved.val[1];
177
+ }
178
+
179
+ inline int32x4_t int4_dot_asm(int32x4_t acc, uint8x16_t packed, int8x16_t a_lo, int8x16_t a_hi) {
180
+ #if defined(__aarch64__)
181
+ int8x16_t b_lo, b_hi;
182
+
183
+ __asm__ __volatile__ (
184
+ "movi v16.16b, #0x0F \n" // low nibble mask
185
+ "movi v17.16b, #7 \n" // sign threshold
186
+ "movi v18.16b, #16 \n" // sign correction
187
+
188
+ "and %[b_lo].16b, %[packed].16b, v16.16b \n"
189
+
190
+ "ushr %[b_hi].16b, %[packed].16b, #4 \n"
191
+
192
+ "cmgt v19.16b, %[b_lo].16b, v17.16b \n"
193
+ "and v19.16b, v19.16b, v18.16b \n"
194
+ "sub %[b_lo].16b, %[b_lo].16b, v19.16b \n"
195
+
196
+ "cmgt v20.16b, %[b_hi].16b, v17.16b \n"
197
+ "and v20.16b, v20.16b, v18.16b \n"
198
+ "sub %[b_hi].16b, %[b_hi].16b, v20.16b \n"
199
+
200
+ "zip1 v21.16b, %[b_lo].16b, %[b_hi].16b \n"
201
+ "zip2 v22.16b, %[b_lo].16b, %[b_hi].16b \n"
202
+
203
+ ".arch armv8.2-a+dotprod \n"
204
+ "sdot %[acc].4s, %[a_lo].16b, v21.16b \n"
205
+ "sdot %[acc].4s, %[a_hi].16b, v22.16b \n"
206
+
207
+ : [acc] "+w"(acc), [b_lo] "=w"(b_lo), [b_hi] "=w"(b_hi)
208
+ : [packed] "w"(packed), [a_lo] "w"(a_lo), [a_hi] "w"(a_hi)
209
+ : "v16", "v17", "v18", "v19", "v20", "v21", "v22"
210
+ );
211
+
212
+ return acc;
213
+ #else
214
+ int8x16_t b_lo, b_hi;
215
+ unpack_int4_to_int8x32(packed, b_lo, b_hi);
216
+ acc = accum_dot(acc, a_lo, b_lo);
217
+ acc = accum_dot(acc, a_hi, b_hi);
218
+ return acc;
219
+ #endif
220
+ }
221
+
222
+ inline int32_t int4_dot_m1_asm(const int8_t* a_ptr, const uint8_t* b_packed, size_t group_size) {
223
+ #if defined(__aarch64__)
224
+ int32x4_t acc = vdupq_n_s32(0);
225
+
226
+ for (size_t k = 0; k < group_size; k += 64) {
227
+ uint8x16_t p0 = vld1q_u8(b_packed + k/2);
228
+ uint8x16_t p1 = vld1q_u8(b_packed + k/2 + 16);
229
+
230
+ int8x16_t a0 = vld1q_s8(a_ptr + k);
231
+ int8x16_t a1 = vld1q_s8(a_ptr + k + 16);
232
+ int8x16_t a2 = vld1q_s8(a_ptr + k + 32);
233
+ int8x16_t a3 = vld1q_s8(a_ptr + k + 48);
234
+
235
+ acc = int4_dot_asm(acc, p0, a0, a1);
236
+ acc = int4_dot_asm(acc, p1, a2, a3);
237
+ }
238
+
239
+ return vaddvq_s32(acc);
240
+ #else
241
+ int32x4_t acc = vdupq_n_s32(0);
242
+ for (size_t k = 0; k < group_size; k += 32) {
243
+ uint8x16_t packed = vld1q_u8(b_packed + k/2);
244
+ int8x16_t b_lo, b_hi;
245
+ unpack_int4_to_int8x32(packed, b_lo, b_hi);
246
+ acc = accum_dot(acc, vld1q_s8(a_ptr + k), b_lo);
247
+ acc = accum_dot(acc, vld1q_s8(a_ptr + k + 16), b_hi);
248
+ }
249
+ return vaddvq_s32(acc);
250
+ #endif
54
251
  }
55
252
 
56
253
  namespace CactusThreading {
57
-
254
+
58
255
  class ThreadPool {
59
256
  private:
257
+ static constexpr size_t MAX_WORKERS = 16;
258
+
60
259
  std::vector<std::thread> workers;
61
- std::queue<std::function<void()>> tasks;
62
- std::mutex queue_mutex;
63
- std::condition_variable condition;
64
- std::atomic<bool> stop{false};
65
- std::atomic<size_t> active_workers{0};
66
- std::condition_variable finish_condition;
67
-
260
+ std::deque<std::function<void()>> tasks;
261
+
262
+ std::mutex mutex;
263
+ std::condition_variable work_available;
264
+ std::condition_variable work_done;
265
+
266
+ bool stop{false};
267
+ std::atomic<size_t> pending_tasks{0};
268
+ size_t num_workers_;
269
+
68
270
  void worker_thread() {
69
271
  while (true) {
70
272
  std::function<void()> task;
71
273
  {
72
- std::unique_lock<std::mutex> lock(queue_mutex);
73
- condition.wait(lock, [this] { return stop || !tasks.empty(); });
74
-
75
- if (stop && tasks.empty()) return;
76
-
274
+ std::unique_lock<std::mutex> lock(mutex);
275
+ work_available.wait(lock, [this] {
276
+ return stop || !tasks.empty();
277
+ });
278
+
279
+ if (stop && tasks.empty()) {
280
+ return;
281
+ }
282
+
77
283
  task = std::move(tasks.front());
78
- tasks.pop();
79
- active_workers++;
284
+ tasks.pop_front();
80
285
  }
81
-
286
+
82
287
  task();
83
-
84
- active_workers--;
85
- finish_condition.notify_all();
288
+
289
+ if (pending_tasks.fetch_sub(1, std::memory_order_acq_rel) == 1) {
290
+ std::lock_guard<std::mutex> lock(mutex);
291
+ work_done.notify_one();
292
+ }
86
293
  }
87
294
  }
88
-
295
+
89
296
  public:
90
- explicit ThreadPool(size_t num_threads = std::thread::hardware_concurrency()) {
91
- workers.reserve(num_threads);
92
- for (size_t i = 0; i < num_threads; ++i) {
297
+ explicit ThreadPool(size_t num_threads = std::thread::hardware_concurrency())
298
+ : stop(false), pending_tasks(0) {
299
+ num_workers_ = std::min(num_threads, MAX_WORKERS);
300
+ if (num_workers_ == 0) num_workers_ = 1;
301
+ workers.reserve(num_workers_);
302
+ for (size_t i = 0; i < num_workers_; ++i) {
93
303
  workers.emplace_back(&ThreadPool::worker_thread, this);
94
304
  }
95
305
  }
96
-
306
+
97
307
  ~ThreadPool() {
98
308
  {
99
- std::unique_lock<std::mutex> lock(queue_mutex);
309
+ std::lock_guard<std::mutex> lock(mutex);
100
310
  stop = true;
101
311
  }
102
- condition.notify_all();
312
+ work_available.notify_all();
103
313
  for (auto& worker : workers) {
104
- worker.join();
314
+ if (worker.joinable()) {
315
+ worker.join();
316
+ }
105
317
  }
106
318
  }
107
-
319
+
108
320
  template<typename F>
109
321
  auto enqueue(F&& f) -> std::future<decltype(f())> {
110
322
  using return_type = decltype(f());
111
-
323
+
112
324
  auto task = std::make_shared<std::packaged_task<return_type()>>(
113
325
  std::forward<F>(f)
114
326
  );
115
-
327
+
116
328
  std::future<return_type> res = task->get_future();
329
+
117
330
  {
118
- std::unique_lock<std::mutex> lock(queue_mutex);
119
- if (stop) throw std::runtime_error("enqueue on stopped ThreadPool");
120
-
121
- tasks.emplace([task](){ (*task)(); });
331
+ std::lock_guard<std::mutex> lock(mutex);
332
+ pending_tasks.fetch_add(1, std::memory_order_relaxed);
333
+ tasks.emplace_back([task](){ (*task)(); });
122
334
  }
123
- condition.notify_one();
335
+ work_available.notify_one();
336
+
124
337
  return res;
125
338
  }
126
-
339
+
340
+ template<typename F>
341
+ void enqueue_batch(size_t total_work, F task_func) {
342
+ if (total_work == 0) return;
343
+
344
+ const size_t num_tasks = std::min(num_workers_, total_work);
345
+ const size_t per_worker = total_work / num_tasks;
346
+ const size_t remainder = total_work % num_tasks;
347
+
348
+ {
349
+ std::lock_guard<std::mutex> lock(mutex);
350
+ pending_tasks.fetch_add(num_tasks, std::memory_order_relaxed);
351
+
352
+ for (size_t w = 0; w < num_tasks; ++w) {
353
+ size_t start = w * per_worker + std::min(w, remainder);
354
+ size_t end = start + per_worker + (w < remainder ? 1 : 0);
355
+ tasks.emplace_back([=]() { task_func(start, end); });
356
+ }
357
+ }
358
+ work_available.notify_all();
359
+ }
360
+
127
361
  void wait_all() {
128
- std::unique_lock<std::mutex> lock(queue_mutex);
129
- finish_condition.wait(lock, [this] {
130
- return tasks.empty() && active_workers == 0;
362
+ std::unique_lock<std::mutex> lock(mutex);
363
+ work_done.wait(lock, [this] {
364
+ return pending_tasks.load(std::memory_order_acquire) == 0;
131
365
  });
132
366
  }
133
-
134
- size_t num_workers() const { return workers.size(); }
367
+
368
+ template<typename F>
369
+ void enqueue_n_threads(size_t total_work, size_t num_threads, F task_func) {
370
+ if (total_work == 0 || num_threads == 0) return;
371
+
372
+ num_threads = std::min(num_threads, std::min(num_workers_, total_work));
373
+ const size_t per_thread = total_work / num_threads;
374
+ const size_t remainder = total_work % num_threads;
375
+
376
+ {
377
+ std::lock_guard<std::mutex> lock(mutex);
378
+ pending_tasks.fetch_add(num_threads, std::memory_order_relaxed);
379
+
380
+ for (size_t t = 0; t < num_threads; ++t) {
381
+ size_t start = t * per_thread + std::min(t, remainder);
382
+ size_t end = start + per_thread + (t < remainder ? 1 : 0);
383
+ tasks.emplace_back([=]() { task_func(start, end); });
384
+ }
385
+ }
386
+ work_available.notify_all();
387
+ }
388
+
389
+ size_t num_workers() const { return num_workers_; }
135
390
  };
136
-
391
+
137
392
  inline ThreadPool& get_thread_pool() {
138
393
  static ThreadPool pool;
139
394
  return pool;
140
395
  }
141
396
 
142
- inline size_t get_optimal_thread_count(size_t total_work, size_t min_work_per_thread) {
143
- if (total_work < min_work_per_thread) return 1;
397
+ struct ParallelConfig {
398
+ size_t min_work_gate;
399
+ size_t work_per_thread;
400
+
401
+ constexpr ParallelConfig(size_t gate, size_t per_thread)
402
+ : min_work_gate(gate), work_per_thread(per_thread) {}
403
+ };
404
+
405
+ inline size_t get_optimal_thread_count(size_t total_work, ParallelConfig config) {
406
+ if (total_work < config.min_work_gate) return 1;
407
+
144
408
  size_t pool_size = get_thread_pool().num_workers();
145
- return std::min(pool_size,
146
- std::max(static_cast<size_t>(1), total_work / min_work_per_thread));
409
+ size_t num_threads = (total_work + config.work_per_thread - 1) / config.work_per_thread;
410
+ return std::min(pool_size, std::max(static_cast<size_t>(1), num_threads));
147
411
  }
148
-
412
+
149
413
  struct Thresholds {
414
+ #if defined(__ANDROID__)
415
+ static constexpr ParallelConfig ATTENTION{64, 32};
416
+ static constexpr ParallelConfig ELEMENT_WISE{5000, 2500};
417
+ static constexpr ParallelConfig AXIS_REDUCE{1000, 500};
418
+ static constexpr ParallelConfig ALL_REDUCE{10000, 5000};
419
+ static constexpr ParallelConfig SCALAR_BASIC{30000, 15000};
420
+ static constexpr ParallelConfig SCALAR_EXPENSIVE{10000, 5000};
421
+ #else // Apple
422
+ static constexpr ParallelConfig ATTENTION{32, 16};
423
+ static constexpr ParallelConfig ELEMENT_WISE{5000, 2500};
424
+ static constexpr ParallelConfig AXIS_REDUCE{1000, 500};
425
+ static constexpr ParallelConfig ALL_REDUCE{10000, 5000};
426
+ static constexpr ParallelConfig SCALAR_BASIC{5000, 2500};
427
+ static constexpr ParallelConfig SCALAR_EXPENSIVE{2500, 1250};
428
+ #endif
429
+ };
150
430
 
431
+ struct GemmThreading {
151
432
  #if defined(__ANDROID__)
152
- static constexpr size_t ELEMENT_WISE = 5000;
153
- static constexpr size_t AXIS_REDUCE = 1000;
154
- static constexpr size_t ALL_REDUCE = 10000;
155
- static constexpr size_t SCALAR_BASIC = 30000;
156
- static constexpr size_t SCALAR_EXPENSIVE = 10000;
157
- static constexpr size_t ATTENTION = 512;
158
- static constexpr size_t GEMM_TILED = 20000;
159
- static constexpr size_t GEMM_SMALL = 64 * 64 * 64;
160
- static constexpr size_t GEMM_MEDIUM = 256 * 256 * 256;
161
- static constexpr size_t GEMM_TILE_M = 64;
162
- static constexpr size_t GEMM_TILE_N = 64;
163
- static constexpr size_t GEMM_TILE_M_SMALL = 32;
164
- static constexpr size_t GEMM_TILE_N_SMALL = 32;
165
- #else // iOS
166
- static constexpr size_t ELEMENT_WISE = 5000;
167
- static constexpr size_t AXIS_REDUCE = 1000;
168
- static constexpr size_t ALL_REDUCE = 10000;
169
- static constexpr size_t SCALAR_BASIC = 5000;
170
- static constexpr size_t SCALAR_EXPENSIVE = 2500;
171
- static constexpr size_t ATTENTION = 4;
172
- static constexpr size_t GEMM_TILED = 4;
173
- static constexpr size_t GEMM_SMALL = 64 * 64 * 64;
174
- static constexpr size_t GEMM_MEDIUM = 256 * 256 * 256;
175
- static constexpr size_t GEMM_TILE_M = 64;
176
- static constexpr size_t GEMM_TILE_N = 64;
177
- static constexpr size_t GEMM_TILE_M_SMALL = 32;
178
- static constexpr size_t GEMM_TILE_N_SMALL = 32;
433
+ static size_t get_num_threads(size_t M, size_t pool_size) {
434
+ if (M <= 1) return 1;
435
+ return pool_size;
436
+ }
437
+ #elif defined(__APPLE__) && TARGET_OS_IPHONE
438
+ static size_t get_num_threads(size_t M, size_t pool_size) {
439
+ if (M <= 1) return std::min(pool_size, static_cast<size_t>(2));
440
+ return pool_size;
441
+ }
442
+ #else // Mac
443
+ static size_t get_num_threads(size_t M, size_t pool_size) {
444
+ if (M <= 1) return std::min(pool_size, static_cast<size_t>(4));
445
+ return pool_size;
446
+ }
179
447
  #endif
180
- static constexpr size_t L2_CACHE_SIZE = 256 * 1024;
181
448
  };
449
+
450
+ inline size_t& get_gemm_thread_override() {
451
+ static size_t override_threads = 0;
452
+ return override_threads;
453
+ }
454
+
455
+ inline void set_gemm_threads(size_t num_threads) {
456
+ get_gemm_thread_override() = num_threads;
457
+ }
458
+
459
+ inline void reset_gemm_threads() {
460
+ get_gemm_thread_override() = 0;
461
+ }
182
462
 
183
463
  class TaskHandle {
184
464
  private:
@@ -225,10 +505,10 @@ namespace CactusThreading {
225
505
  };
226
506
 
227
507
  template<typename WorkFunc>
228
- TaskHandle parallel_for(size_t total_work, size_t threshold, WorkFunc work_func, bool wait = true) {
229
- const size_t num_threads = get_optimal_thread_count(total_work, threshold);
230
- TaskHandle handle(!wait);
231
-
508
+ TaskHandle parallel_for(size_t total_work, ParallelConfig config, WorkFunc work_func, bool wait = true) {
509
+ const size_t num_threads = get_optimal_thread_count(total_work, config);
510
+ TaskHandle handle(!wait);
511
+
232
512
  if (num_threads == 1) {
233
513
  if (wait) {
234
514
  work_func(0, total_work);
@@ -240,10 +520,10 @@ namespace CactusThreading {
240
520
  }));
241
521
  return handle;
242
522
  }
243
-
523
+
244
524
  auto& pool = get_thread_pool();
245
525
  const size_t work_per_thread = total_work / num_threads;
246
-
526
+
247
527
  for (size_t t = 0; t < num_threads; ++t) {
248
528
  handle.add_future(pool.enqueue([work_func, t, num_threads, work_per_thread, total_work]() {
249
529
  const size_t start_idx = t * work_per_thread;
@@ -251,17 +531,17 @@ namespace CactusThreading {
251
531
  work_func(start_idx, end_idx);
252
532
  }));
253
533
  }
254
-
534
+
255
535
  if (wait) {
256
536
  handle.wait();
257
537
  }
258
538
  return handle;
259
539
  }
260
-
540
+
261
541
  template<typename WorkFunc>
262
- void parallel_for_2d(size_t outer_size, size_t inner_size, size_t threshold, WorkFunc work_func) {
542
+ void parallel_for_2d(size_t outer_size, size_t inner_size, ParallelConfig config, WorkFunc work_func) {
263
543
  const size_t total_work = outer_size * inner_size;
264
- parallel_for(total_work, threshold, [&](size_t start_idx, size_t end_idx) {
544
+ parallel_for(total_work, config, [&](size_t start_idx, size_t end_idx) {
265
545
  for (size_t work_idx = start_idx; work_idx < end_idx; ++work_idx) {
266
546
  const size_t outer = work_idx / inner_size;
267
547
  const size_t inner = work_idx % inner_size;
@@ -269,11 +549,11 @@ namespace CactusThreading {
269
549
  }
270
550
  });
271
551
  }
272
-
552
+
273
553
  template<typename WorkFunc, typename ResultType, typename CombineFunc>
274
- ResultType parallel_reduce(size_t total_work, size_t threshold,
554
+ ResultType parallel_reduce(size_t total_work, ParallelConfig config,
275
555
  WorkFunc work_func, ResultType init_value, CombineFunc combine_func) {
276
- const size_t num_threads = get_optimal_thread_count(total_work, threshold);
556
+ const size_t num_threads = get_optimal_thread_count(total_work, config);
277
557
 
278
558
  if (num_threads == 1) {
279
559
  return work_func(0, total_work);
@@ -298,46 +578,25 @@ namespace CactusThreading {
298
578
  }
299
579
  return result;
300
580
  }
301
-
302
- inline size_t compute_gemm_parallelism(size_t M, size_t K, size_t N, size_t element_size) {
303
- size_t total_ops = M * K * N;
304
-
305
- if (total_ops < Thresholds::GEMM_SMALL) return 1;
306
-
307
- if (total_ops < Thresholds::GEMM_MEDIUM) {
308
- return std::min(static_cast<size_t>(2), get_thread_pool().num_workers());
309
- }
310
-
311
- size_t bytes_accessed = (M * K + K * N + M * N) * element_size;
312
- size_t cache_tiles = (bytes_accessed + Thresholds::L2_CACHE_SIZE - 1) / Thresholds::L2_CACHE_SIZE;
313
-
314
- size_t compute_threads = std::sqrt(static_cast<double>(total_ops) / Thresholds::GEMM_SMALL);
315
- size_t memory_threads = cache_tiles;
316
-
317
- size_t optimal = std::min(compute_threads, memory_threads);
318
- return std::min(optimal, get_thread_pool().num_workers());
319
- }
320
-
581
+
321
582
  template<typename WorkFunc>
322
- void parallel_for_2d_tiled(size_t rows, size_t cols, size_t tile_rows, size_t tile_cols, WorkFunc work_func) {
323
- size_t num_row_tiles = (rows + tile_rows - 1) / tile_rows;
324
- size_t num_col_tiles = (cols + tile_cols - 1) / tile_cols;
325
- size_t total_tiles = num_row_tiles * num_col_tiles;
326
-
327
- parallel_for(total_tiles, Thresholds::GEMM_TILED, [=](size_t start_tile, size_t end_tile) {
328
- for (size_t tile_idx = start_tile; tile_idx < end_tile; ++tile_idx) {
329
- size_t tile_row = tile_idx / num_col_tiles;
330
- size_t tile_col = tile_idx % num_col_tiles;
331
-
332
- size_t row_start = tile_row * tile_rows;
333
- size_t row_end = std::min(row_start + tile_rows, rows);
334
- size_t col_start = tile_col * tile_cols;
335
- size_t col_end = std::min(col_start + tile_cols, cols);
336
-
337
- work_func(row_start, row_end, col_start, col_end);
338
- }
339
- });
583
+ void parallel_gemm_tiles(size_t M, size_t total_tiles, WorkFunc work_func) {
584
+ auto& pool = get_thread_pool();
585
+
586
+ size_t override = get_gemm_thread_override();
587
+ size_t num_threads = (override > 0) ? override : GemmThreading::get_num_threads(M, pool.num_workers());
588
+ num_threads = std::min(num_threads, total_tiles);
589
+
590
+ if (num_threads <= 1) {
591
+ work_func(0, total_tiles);
592
+ return;
593
+ }
594
+
595
+ pool.enqueue_n_threads(total_tiles, num_threads, work_func);
596
+ pool.wait_all();
340
597
  }
598
+
341
599
  }
342
600
 
601
+
343
602
  #endif // KERNEL_UTILS_H