npm - cactus-react-native - Versions diffs - 1.10.3 → 1.12.0 - Mend

cactus-react-native 1.10.3 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)

package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h CHANGED

@@ -122,13 +122,14 @@ enum class Activation {
 enum class OpType {
     INPUT, PRECISION_CAST,
     ADD, ADD_CLIPPED, SUBTRACT, MULTIPLY, DIVIDE,
+    ABS, POW, FLATTEN, VIEW,
     MATMUL, TRANSPOSE, RESHAPE, SLICE, GATHER, EMBEDDING,
     BILINEAR_INTERPOLATION,
     SUM, MEAN, VARIANCE, MIN, MAX,
     RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, REL_POS_BIAS, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D, CONV1D_SAME_DEPTHWISE_K9, CONV1D_POINTWISE, CONV2D_K3S2P1, CONV2D_DEPTHWISE_K3S2P1, CONV2D_POINTWISE_1X1, GLU, BATCHNORM,
     SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN, SCALAR_LOG,
     RELU, SILU, GELU, GELU_ERF, SIGMOID, TANH,
-    SAMPLE, CONCAT,
+    SAMPLE, CONCAT, CAT,
     SCATTER_TOPK,
     TOPK, LAYERNORM, GROUPNORM,
     MOE_LAYER,
@@ -136,7 +137,17 @@ enum class OpType {
     PERSISTENT,
     QUANTIZE_ACTIVATIONS,
     LSTM_CELL,
-    STFT
+    GATED_DELTANET_DECODE,
+    GATED_DELTANET_PREFILL,
+    STFT,
+    ALTUP_PREDICT,
+    ALTUP_CORRECT,
+    GAUSSIAN_TOPK,
+    MAXPOOL1D,
+    BILSTM_SEQUENCE,
+    LEAKY_RELU,
+    CONV2D_K3S1P1,
+    STATS_POOL
 };
 struct PrecisionTraits {
@@ -315,6 +326,7 @@ struct OpParams {
     size_t window_size = 0;
     bool is_causal = true;
     bool attention_mask_is_additive = false;
+    float logit_cap = 0.0f;
     std::vector<size_t> new_shape;
     std::vector<size_t> permutation;
     Precision output_precision = Precision::INT8;
@@ -350,6 +362,10 @@ struct OpParams {
     size_t num_kv_heads = 0;
     size_t head_dim = 0;
     size_t num_fft_bins = 0;
+    size_t chunk_size = 0;
+    size_t num_altup_inputs = 0;
+    size_t v_head_dim = 0;
+    size_t kernel_size = 0;
 };
 struct GraphNode {
@@ -362,6 +378,28 @@ struct GraphNode {
     GraphNode(size_t node_id, OpType type);
 };
+using nodes_vector = std::vector<std::unique_ptr<GraphNode>>;
+using node_index_map_t = std::unordered_map<size_t, size_t>;
+inline const BufferDesc& get_input(const GraphNode& node, size_t idx,
+                                   const nodes_vector& nodes,
+                                   const node_index_map_t& node_index_map) {
+    return nodes[node_index_map.at(node.input_ids[idx])]->output_buffer;
+}
+struct AxisDims {
+    size_t outer, axis_size, inner;
+    static AxisDims from_shape(const std::vector<size_t>& shape, size_t axis) {
+        AxisDims d;
+        d.outer = 1;
+        for (size_t i = 0; i < axis; i++) d.outer *= shape[i];
+        d.axis_size = shape[axis];
+        d.inner = 1;
+        for (size_t i = axis + 1; i < shape.size(); i++) d.inner *= shape[i];
+        return d;
+    }
+};
 template<typename T>
 void dispatch_binary_op(OpType op, const T* lhs, const T* rhs, T* output, size_t count);
@@ -383,6 +421,14 @@ void compute_groupnorm_node(GraphNode& node, const std::vector<std::unique_ptr<G
 void compute_persistent_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 void compute_index_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 void compute_lstm_cell_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_gated_deltanet_decode_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_gated_deltanet_prefill_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_altup_predict_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_altup_correct_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_maxpool1d_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_bilstm_sequence_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_conv2d_k3s1p1_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_stats_pool_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 void shrink_thread_local_buffers();
 class BufferPool {
@@ -437,7 +483,6 @@ public:
     size_t multiply(size_t input1, size_t input2);
     size_t divide(size_t input1, size_t input2);
     size_t scalar_add(size_t input, float value);
     size_t scalar_subtract(size_t input, float value);
     size_t scalar_multiply(size_t input, float value);
@@ -455,6 +500,11 @@ public:
     size_t sigmoid(size_t input);
     size_t tanh(size_t input);
     size_t glu(size_t input, int axis = -1);
+    size_t abs(size_t input);
+    size_t pow(size_t input, float exponent);
+    size_t view(size_t input, const std::vector<size_t>& new_shape);
+    size_t flatten(size_t input, int start_dim = 0, int end_dim = -1);
     size_t matmul(size_t input1, size_t input2, bool pretransposed_rhs = false, ComputeBackend backend = ComputeBackend::CPU);
     size_t transpose(size_t input, ComputeBackend backend = ComputeBackend::CPU);
@@ -497,7 +547,9 @@ public:
                      size_t num_experts_per_tok,
                      bool normalize_routing,
                      float epsilon,
-                     float routed_scaling_factor);
+                     float routed_scaling_factor,
+                     Activation activation = Activation::SILU,
+                     size_t per_expert_scale = 0);
     size_t moe_layer(size_t hidden,
                      size_t routing_probs,
                      size_t topk_indices,
@@ -518,13 +570,15 @@ public:
     size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, size_t window_size, ComputeBackend backend = ComputeBackend::CPU);
     size_t attention_masked(size_t query, size_t key, size_t value, size_t mask, float scale,
                             bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU,
-                            bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0);
+                            bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0,
+                            float logit_cap = 0.0f);
     size_t rel_pos_bias(size_t query, size_t relative_key, float scale);
     size_t attention_int8_hybrid(size_t query, size_t key_new, size_t value_new, float scale, size_t position_offset,
                                  const int8_t* cached_keys, const int8_t* cached_values,
                                  const float* k_scales, const float* v_scales,
-                                 size_t cache_len, size_t num_kv_heads, size_t head_dim, size_t window_size = 0);
+                                 size_t cache_len, size_t num_kv_heads, size_t head_dim,
+                                 size_t window_size = 0, size_t v_head_dim = 0);
     size_t conv1d_causal(size_t input, size_t weight, size_t kernel_size, size_t dilation = 1);
     size_t conv1d_k3(size_t input, size_t weight, size_t stride);
@@ -543,12 +597,30 @@ public:
     size_t conv2d_pointwise_1x1(size_t input, size_t weight, size_t bias);
     size_t lstm_cell(size_t input, size_t h_prev, size_t c_prev, size_t weight_ih, size_t weight_hh, size_t bias_ih, size_t bias_hh);
+    size_t gated_deltanet_decode(size_t query, size_t key, size_t value, size_t gate_log, size_t beta,
+                                 size_t initial_state, float scale = 0.0f);
+    size_t gated_deltanet_prefill(size_t query, size_t key, size_t value, size_t gate_log, size_t beta,
+                                  size_t initial_state, size_t chunk_size = 64, float scale = 0.0f);
     size_t stft(size_t input, size_t weight, size_t stride, size_t num_fft_bins);
+    size_t altup_predict(size_t coefs, const size_t* streams, size_t num_streams);
+    size_t altup_correct(size_t coefs, size_t innovation, const size_t* predictions, size_t num_predictions);
+    size_t gaussian_topk(size_t input, float ppf);
+    size_t maxpool1d(size_t input, size_t kernel_size, size_t stride);
+    size_t leaky_relu(size_t input, float negative_slope = 0.01f);
+    size_t bilstm_sequence(size_t input, size_t w_ih_fwd, size_t w_hh_fwd, size_t b_ih_fwd, size_t b_hh_fwd,
+                           size_t w_ih_bwd, size_t w_hh_bwd, size_t b_ih_bwd, size_t b_hh_bwd);
+    size_t conv2d_k3s1p1(size_t input, size_t weight);
+    size_t conv2d_k3s1p1(size_t input, size_t weight, size_t bias);
+    size_t stats_pool(size_t input);
     size_t sample(size_t logits, float temperature = 0.6f, float top_p = 0.95f, size_t top_k = 20,
                   const std::unordered_map<uint32_t, float>& logit_bias = {});
     size_t concat(size_t input1, size_t input2, int axis = 0);
+    size_t cat(const std::vector<size_t>& inputs, int axis);
     size_t scatter_topk(size_t indices, size_t values, size_t num_classes);
     void set_input(size_t node_id, const void* data, Precision precision);
@@ -653,4 +725,4 @@ namespace GraphFile {
     };
 }
-#endif
+#endif

package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h CHANGED

@@ -11,7 +11,9 @@ enum class ScalarOpType {
     SUBTRACT,
     MULTIPLY,
     DIVIDE,
+    ABS,
     EXP,
+    POW,
     SQRT,
     COS,
     SIN,
@@ -54,6 +56,14 @@ void cactus_matmul_int8(const int8_t* A, const float* A_scales,
                         const int8_t* B, const __fp16* B_scales,
                         __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
+void cactus_gemv_int8_i8mm(const int8_t* A, float A_scale,
+                            const int8_t* B, const __fp16* B_scales,
+                            __fp16* C, size_t K, size_t N, size_t group_size);
+void cactus_gemm_int8_i8mm(const int8_t* A, const float* A_scales,
+                            const int8_t* B, const __fp16* B_scales,
+                            __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
 void cactus_gemv_int4(const int8_t* A, float A_scale,
                       const int8_t* B_packed, const __fp16* B_scales,
                       __fp16* C, size_t K, size_t N, size_t group_size);
@@ -97,6 +107,9 @@ void cactus_max_axis_f16(const __fp16* input, __fp16* output, size_t outer_size,
 void cactus_rms_norm_f16(const __fp16* input, const __fp16* weight, __fp16* output,
                           size_t batch_size, size_t dims, float eps);
+void cactus_layer_norm_f16(const __fp16* input, const __fp16* weight, const __fp16* bias,
+                            __fp16* output, size_t batch_size, size_t dims, float eps);
 void cactus_rope_f16(const __fp16* input, __fp16* output, size_t batch_size, size_t seq_len,
                       size_t num_heads, size_t head_dim, size_t start_pos, float theta);
@@ -108,6 +121,8 @@ void cactus_softmax_f16(const __fp16* input, __fp16* output, size_t batch_size,
 void cactus_relu_f16(const __fp16* input, __fp16* output, size_t num_elements);
+void cactus_leaky_relu_f16(const __fp16* input, __fp16* output, size_t num_elements, float negative_slope);
 void cactus_silu_f16(const __fp16* input, __fp16* output, size_t num_elements);
 void cactus_gelu_f16(const __fp16* input, __fp16* output, size_t num_elements);
@@ -163,21 +178,54 @@ void cactus_batchnorm_f32(
 void cactus_attention_f16(const __fp16* queries, const __fp16* keys, const __fp16* values, __fp16* output,
                           size_t batch_size, size_t seq_len, size_t kv_seq_len, size_t num_q_heads, size_t num_kv_heads,
                           size_t head_dim, float scale, const __fp16* mask, size_t position_offset = 0, size_t window_size = 0,
-                          bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false);
+                          bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false,
+                          size_t v_head_dim = 0, float logit_cap = 0.0f);
 void cactus_attention_hybrid_int8_fp16(
-    const __fp16* queries,
-    const int8_t* keys_cached,
-    const int8_t* values_cached,
+    const __fp16* queries,
+    const int8_t* keys_cached,
+    const int8_t* values_cached,
     const float* k_scales,
-    const float* v_scales,
-    const __fp16* keys_new,
-    const __fp16* values_new,
+    const float* v_scales,
+    const __fp16* keys_new,
+    const __fp16* values_new,
     __fp16* output,
     size_t batch_size, size_t seq_len, size_t cache_len, size_t new_len,
     size_t num_q_heads, size_t num_kv_heads, size_t head_dim,
     float scale, size_t position_offset = 0, bool is_causal = true, size_t window_size = 0,
-    size_t group_size = KV_QUANT_GROUP_SIZE);
+    size_t group_size = KV_QUANT_GROUP_SIZE, size_t v_head_dim = 0);
+void cactus_gated_deltanet_decode_f16(
+    const __fp16* q_data,
+    const __fp16* k_data,
+    const __fp16* v_data,
+    const __fp16* g_data,
+    const __fp16* b_data,
+    const __fp16* s_data,
+    __fp16* out,
+    size_t B,
+    size_t Hq,
+    size_t Hv,
+    size_t K,
+    size_t V,
+    float scale);
+void cactus_gated_deltanet_prefill_f16(
+    const __fp16* q_data,
+    const __fp16* k_data,
+    const __fp16* v_data,
+    const __fp16* g_data,
+    const __fp16* b_data,
+    const __fp16* s_data,
+    __fp16* out,
+    size_t B,
+    size_t T,
+    size_t Hq,
+    size_t Hv,
+    size_t K,
+    size_t V,
+    size_t requested_chunk_size,
+    float scale);
 void cactus_conv1d_causal_depthwise_f16(
     const __fp16* input,
@@ -244,6 +292,18 @@ void cactus_conv1d_same_depthwise_f16_k9(
     size_t C
 );
+void cactus_conv2d_f16_k3s1p1_nchw(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t C_in,
+    size_t H,
+    size_t W,
+    size_t C_out
+);
 void cactus_conv2d_f16_k3s2p1_nchw(
     const __fp16* input,
     const __fp16* weight,
@@ -305,6 +365,8 @@ void cactus_sample_f16(const __fp16* logits, uint32_t* output, size_t vocab_size
 void cactus_concat_f16(const __fp16* input1, const __fp16* input2, __fp16* output,
                        const size_t* shape1, const size_t* shape2, const size_t* output_shape,
                        size_t ndims, int axis);
+void cactus_cat_f16(const __fp16** inputs, __fp16* output, const size_t** input_shapes,
+                      const size_t* output_shape, size_t num_inputs, size_t rank, int axis);
 void cactus_int8_to_fp32(const int8_t* src, float* dst, size_t count, float scale = 1.0f);
 void cactus_fp32_to_int8(const float* src, int8_t* dst, size_t count, float scale = 1.0f);
@@ -328,6 +390,30 @@ inline size_t kv_scales_count(size_t seq_len, size_t kv_heads, size_t head_dim,
 void cactus_unpack_int4_to_int8(const uint8_t* packed, int8_t* unpacked, size_t unpacked_count);
+void cactus_gaussian_topk_f16(
+    const __fp16* input,
+    __fp16* output,
+    size_t rows,
+    size_t cols,
+    float ppf);
+void cactus_altup_predict_f16(
+    const __fp16* coefs,
+    const __fp16* const* streams,
+    __fp16* output,
+    size_t n,
+    size_t seq_len,
+    size_t hidden_dim);
+void cactus_altup_correct_f16(
+    const __fp16* coefs,
+    const __fp16* innovation,
+    const __fp16* const* predictions,
+    __fp16* output,
+    size_t n,
+    size_t seq_len,
+    size_t hidden_dim);
 void cactus_lstm_cell_f16(
     const __fp16* x_input,
     const __fp16* h_prev,
@@ -343,4 +429,31 @@ void cactus_lstm_cell_f16(
     size_t hidden_size
 );
-#endif
+void cactus_bilstm_sequence_f16(
+    const __fp16* input,
+    const __fp16* weight_ih_fwd,
+    const __fp16* weight_hh_fwd,
+    const __fp16* bias_ih_fwd,
+    const __fp16* bias_hh_fwd,
+    const __fp16* weight_ih_bwd,
+    const __fp16* weight_hh_bwd,
+    const __fp16* bias_ih_bwd,
+    const __fp16* bias_hh_bwd,
+    __fp16* output,
+    size_t batch_size,
+    size_t seq_len,
+    size_t input_size,
+    size_t hidden_size
+);
+void cactus_maxpool1d_f16(
+    const __fp16* input,
+    __fp16* output,
+    size_t batch_size,
+    size_t channels,
+    size_t input_length,
+    size_t kernel_size,
+    size_t stride
+);
+#endif

package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h CHANGED

@@ -9,6 +9,8 @@
 #if defined(__ANDROID__)
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
+#include <sched.h>
+#include <fstream>
 #endif
 #include <algorithm>
 #include <cmath>
@@ -44,6 +46,29 @@ inline void stream_store_f16x8(__fp16* dst, float16x8_t val) {
 #endif
 }
+inline bool cpu_has_i8mm() {
+#if defined(__aarch64__)
+    static std::once_flag once;
+    static bool has = false;
+    std::call_once(once, []() {
+#if defined(__APPLE__)
+    has = true;
+#elif defined(__ANDROID__)
+    unsigned long hwcap2 = getauxval(AT_HWCAP2);
+    #ifndef HWCAP2_I8MM
+    #define HWCAP2_I8MM (1 << 13)
+    #endif
+    has = (hwcap2 & HWCAP2_I8MM) != 0;
+#endif
+    });
+    return has;
+#else
+    return false;
+#endif
+}
 inline bool cpu_has_sme2() {
 #if defined(__aarch64__)
 	static std::once_flag once;
@@ -130,6 +155,33 @@ inline float32x4_t fast_tanh_f32x4(float32x4_t x) {
     return result;
 }
+constexpr size_t SIMD_F16_WIDTH = 8;
+inline size_t simd_align(size_t count, size_t width = SIMD_F16_WIDTH) {
+    return (count / width) * width;
+}
+inline void f16x8_split_f32(float16x8_t v, float32x4_t& lo, float32x4_t& hi) {
+    lo = vcvt_f32_f16(vget_low_f16(v));
+    hi = vcvt_f32_f16(vget_high_f16(v));
+}
+inline float16x8_t f32_merge_f16(float32x4_t lo, float32x4_t hi) {
+    return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));
+}
+inline float32x4_t fast_sigmoid_f32x4(float32x4_t x) {
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    return vdivq_f32(one, vaddq_f32(one, fast_exp_f32x4(vnegq_f32(x))));
+}
+template<typename F32x4Op>
+inline float16x8_t apply_f32_op_on_f16x8(float16x8_t v, F32x4Op op) {
+    float32x4_t lo, hi;
+    f16x8_split_f32(v, lo, hi);
+    return f32_merge_f16(op(lo), op(hi));
+}
 inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded, int8x16_t& low_decoded) {
     int8x16_t packed = vreinterpretq_s8_u8(vld1q_u8(ptr));
     high_decoded = vshrq_n_s8(packed, 4);
@@ -138,6 +190,80 @@ inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded
 namespace CactusThreading {
+#if defined(__ANDROID__)
+    struct CoreTopology {
+        std::vector<int> performance_cores;
+        std::vector<int> all_cores;
+        static CoreTopology& get() {
+            static CoreTopology topo = detect();
+            return topo;
+        }
+    private:
+        static int read_sysfs_int(const char* path) {
+            std::ifstream f(path);
+            if (!f.is_open()) return -1;
+            int val = -1;
+            f >> val;
+            return val;
+        }
+        static CoreTopology detect() {
+            CoreTopology topo;
+            constexpr int MAX_CPUS = 16;
+            std::vector<std::pair<int, int>> core_caps;
+            for (int i = 0; i < MAX_CPUS; ++i) {
+                char path[128];
+                snprintf(path, sizeof(path),
+                         "/sys/devices/system/cpu/cpu%d/cpu_capacity", i);
+                int cap = read_sysfs_int(path);
+                if (cap > 0) {
+                    core_caps.push_back({i, cap});
+                    topo.all_cores.push_back(i);
+                    continue;
+                }
+                snprintf(path, sizeof(path),
+                         "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", i);
+                int freq = read_sysfs_int(path);
+                if (freq > 0) {
+                    core_caps.push_back({i, freq});
+                    topo.all_cores.push_back(i);
+                }
+            }
+            if (core_caps.empty()) return topo;
+            int max_cap = 0;
+            for (auto& [id, cap] : core_caps) {
+                max_cap = std::max(max_cap, cap);
+            }
+            int threshold = static_cast<int>(max_cap * 0.70);
+            for (auto& [id, cap] : core_caps) {
+                if (cap >= threshold) {
+                    topo.performance_cores.push_back(id);
+                }
+            }
+            return topo;
+        }
+    };
+    inline bool pin_current_thread_to_cores(const std::vector<int>& cores) {
+        if (cores.empty()) return false;
+        cpu_set_t mask;
+        CPU_ZERO(&mask);
+        for (int core : cores) {
+            CPU_SET(core, &mask);
+        }
+        return sched_setaffinity(0, sizeof(mask), &mask) == 0;
+    }
+#endif
     class ThreadPool {
     private:
         static constexpr size_t MAX_WORKERS = 16;
@@ -184,9 +310,25 @@ namespace CactusThreading {
             : stop(false), pending_tasks(0) {
             num_workers_ = std::min(num_threads, MAX_WORKERS);
             if (num_workers_ == 0) num_workers_ = 1;
+#if defined(__ANDROID__)
+            auto& topo = CoreTopology::get();
+            if (!topo.performance_cores.empty()) {
+                num_workers_ = std::min(num_workers_, topo.performance_cores.size());
+            }
+#endif
             workers.reserve(num_workers_);
             for (size_t i = 0; i < num_workers_; ++i) {
-                workers.emplace_back(&ThreadPool::worker_thread, this);
+                workers.emplace_back([this]() {
+#if defined(__ANDROID__)
+                    auto& perf = CoreTopology::get().performance_cores;
+                    if (!perf.empty()) {
+                        pin_current_thread_to_cores(perf);
+                    }
+#endif
+                    worker_thread();
+                });
             }
         }
@@ -498,5 +640,52 @@ namespace CactusThreading {
 }
+template<typename SimdOp, typename ScalarOp>
+void elementwise_op_f16(const __fp16* input, __fp16* output, size_t num_elements,
+                        bool use_streaming, CactusThreading::ParallelConfig config,
+                        SimdOp simd_op, ScalarOp scalar_op, size_t unroll = 4) {
+    CactusThreading::parallel_for(num_elements, config,
+        [&](size_t start, size_t end) {
+            const size_t n = end - start;
+            const size_t vec_end = start + simd_align(n);
+            if (use_streaming && unroll >= 4) {
+                const size_t unrolled_end = start + simd_align(n, SIMD_F16_WIDTH * 4);
+                for (size_t i = start; i < unrolled_end; i += SIMD_F16_WIDTH * 4) {
+                    __builtin_prefetch(&input[i + 256], 0, 0);
+                    float16x8_t v0 = simd_op(vld1q_f16(&input[i]));
+                    float16x8_t v1 = simd_op(vld1q_f16(&input[i + 8]));
+                    float16x8_t v2 = simd_op(vld1q_f16(&input[i + 16]));
+                    float16x8_t v3 = simd_op(vld1q_f16(&input[i + 24]));
+                    stream_store_f16x8(&output[i], v0);
+                    stream_store_f16x8(&output[i + 8], v1);
+                    stream_store_f16x8(&output[i + 16], v2);
+                    stream_store_f16x8(&output[i + 24], v3);
+                }
+                for (size_t i = unrolled_end; i < vec_end; i += SIMD_F16_WIDTH) {
+                    stream_store_f16x8(&output[i], simd_op(vld1q_f16(&input[i])));
+                }
+            } else if (use_streaming && unroll >= 2) {
+                const size_t unrolled_end = start + simd_align(n, SIMD_F16_WIDTH * 2);
+                for (size_t i = start; i < unrolled_end; i += SIMD_F16_WIDTH * 2) {
+                    __builtin_prefetch(&input[i + 128], 0, 0);
+                    float16x8_t v0 = simd_op(vld1q_f16(&input[i]));
+                    float16x8_t v1 = simd_op(vld1q_f16(&input[i + 8]));
+                    stream_store_f16x8(&output[i], v0);
+                    stream_store_f16x8(&output[i + 8], v1);
+                }
+                for (size_t i = unrolled_end; i < vec_end; i += SIMD_F16_WIDTH) {
+                    stream_store_f16x8(&output[i], simd_op(vld1q_f16(&input[i])));
+                }
+            } else {
+                for (size_t i = start; i < vec_end; i += SIMD_F16_WIDTH) {
+                    vst1q_f16(&output[i], simd_op(vld1q_f16(&input[i])));
+                }
+            }
+            for (size_t i = vec_end; i < end; ++i) {
+                output[i] = scalar_op(input[i]);
+            }
+        });
+}
-#endif // KERNEL_UTILS_H
+#endif // KERNEL_UTILS_H

package/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus CHANGED

Binary file