whisper.rn 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/android/src/main/jni.cpp +12 -3
  4. package/cpp/ggml-alloc.c +292 -130
  5. package/cpp/ggml-backend-impl.h +4 -4
  6. package/cpp/ggml-backend-reg.cpp +13 -5
  7. package/cpp/ggml-backend.cpp +207 -17
  8. package/cpp/ggml-backend.h +19 -1
  9. package/cpp/ggml-cpu/amx/amx.cpp +5 -2
  10. package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
  11. package/cpp/ggml-cpu/arch-fallback.h +0 -4
  12. package/cpp/ggml-cpu/common.h +14 -0
  13. package/cpp/ggml-cpu/ggml-cpu-impl.h +14 -7
  14. package/cpp/ggml-cpu/ggml-cpu.c +65 -44
  15. package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
  16. package/cpp/ggml-cpu/ops.cpp +542 -775
  17. package/cpp/ggml-cpu/ops.h +2 -0
  18. package/cpp/ggml-cpu/simd-mappings.h +88 -59
  19. package/cpp/ggml-cpu/unary-ops.cpp +135 -0
  20. package/cpp/ggml-cpu/unary-ops.h +5 -0
  21. package/cpp/ggml-cpu/vec.cpp +227 -20
  22. package/cpp/ggml-cpu/vec.h +407 -56
  23. package/cpp/ggml-cpu.h +1 -1
  24. package/cpp/ggml-impl.h +94 -12
  25. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  26. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  27. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  28. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  29. package/cpp/ggml-metal/ggml-metal-device.cpp +1565 -0
  30. package/cpp/ggml-metal/ggml-metal-device.h +244 -0
  31. package/cpp/ggml-metal/ggml-metal-device.m +1325 -0
  32. package/cpp/ggml-metal/ggml-metal-impl.h +802 -0
  33. package/cpp/ggml-metal/ggml-metal-ops.cpp +3583 -0
  34. package/cpp/ggml-metal/ggml-metal-ops.h +88 -0
  35. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  36. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  37. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  38. package/cpp/ggml-metal-impl.h +40 -40
  39. package/cpp/ggml-metal.h +1 -6
  40. package/cpp/ggml-quants.c +1 -0
  41. package/cpp/ggml.c +341 -15
  42. package/cpp/ggml.h +150 -5
  43. package/cpp/jsi/RNWhisperJSI.cpp +9 -2
  44. package/cpp/jsi/ThreadPool.h +3 -3
  45. package/cpp/rn-whisper.h +1 -0
  46. package/cpp/whisper.cpp +89 -72
  47. package/cpp/whisper.h +1 -0
  48. package/ios/CMakeLists.txt +6 -1
  49. package/ios/RNWhisperContext.mm +3 -1
  50. package/ios/RNWhisperVadContext.mm +14 -13
  51. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  57. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
  58. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  59. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  60. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  61. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  62. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  68. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  69. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
  70. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  71. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  72. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  73. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  74. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  75. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  76. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  77. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  78. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  79. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  80. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  81. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  82. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
  83. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  84. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  85. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  86. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  87. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  88. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  89. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  90. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  91. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  92. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  93. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  94. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
  95. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  96. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  97. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  98. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  99. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  100. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  101. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  102. package/lib/commonjs/version.json +1 -1
  103. package/lib/module/NativeRNWhisper.js.map +1 -1
  104. package/lib/module/version.json +1 -1
  105. package/lib/typescript/NativeRNWhisper.d.ts +2 -0
  106. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  107. package/package.json +1 -1
  108. package/src/NativeRNWhisper.ts +2 -0
  109. package/src/version.json +1 -1
  110. package/whisper-rn.podspec +8 -9
  111. package/cpp/ggml-metal.m +0 -6779
  112. package/cpp/ggml-whisper-sim.metallib +0 -0
  113. package/cpp/ggml-whisper.metallib +0 -0
@@ -132,6 +132,8 @@ extern "C" {
         WSP_GGML_BACKEND_DEVICE_TYPE_CPU,
         // GPU device using dedicated memory
         WSP_GGML_BACKEND_DEVICE_TYPE_GPU,
+        // integrated GPU device using host memory
+        WSP_GGML_BACKEND_DEVICE_TYPE_IGPU,
         // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
         WSP_GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
@@ -150,11 +152,21 @@ extern "C" {
 
     // all the device properties
     struct wsp_ggml_backend_dev_props {
+        // device name
         const char * name;
+        // device description
         const char * description;
+        // device free memory in bytes
         size_t memory_free;
+        // device total memory in bytes
         size_t memory_total;
+        // device type
         enum wsp_ggml_backend_dev_type type;
+        // device id
+        // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        // if the id is unknown, this should be NULL
+        const char * device_id;
+        // device capabilities
         struct wsp_ggml_backend_dev_caps caps;
     };
 
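The expanded wsp_ggml_backend_dev_props struct documents every field and adds a nullable device_id alongside the new IGPU device type. A minimal sketch of reading these properties, assuming the wsp_-prefixed enumeration helpers (wsp_ggml_backend_dev_count, wsp_ggml_backend_dev_get, wsp_ggml_backend_dev_get_props) mirror upstream ggml-backend.h:

    // Sketch: list devices and print the new fields. device_id may be NULL
    // when unknown, and WSP_GGML_BACKEND_DEVICE_TYPE_IGPU is new in 0.5.2.
    #include <cstdio>
    #include "ggml-backend.h"

    static void print_devices(void) {
        for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
            wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
            struct wsp_ggml_backend_dev_props props;
            wsp_ggml_backend_dev_get_props(dev, &props);
            std::printf("%s (%s): id=%s type=%d free=%zu total=%zu\n",
                        props.name, props.description,
                        props.device_id ? props.device_id : "unknown",
                        (int) props.type, props.memory_free, props.memory_total);
        }
    }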
@@ -203,6 +215,8 @@ extern "C" {
     // Backend registry
     //
 
+    WSP_GGML_API void wsp_ggml_backend_register(wsp_ggml_backend_reg_t reg);
+
     WSP_GGML_API void wsp_ggml_backend_device_register(wsp_ggml_backend_dev_t device);
 
     // Backend (reg) enumeration
@@ -302,11 +316,15 @@ extern "C" {
     WSP_GGML_API int wsp_ggml_backend_sched_get_n_splits(wsp_ggml_backend_sched_t sched);
     WSP_GGML_API int wsp_ggml_backend_sched_get_n_copies(wsp_ggml_backend_sched_t sched);
 
-    WSP_GGML_API size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
+    WSP_GGML_API wsp_ggml_backend_buffer_type_t wsp_ggml_backend_sched_get_buffer_type(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
+    WSP_GGML_API size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
 
     WSP_GGML_API void wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, wsp_ggml_backend_t backend);
     WSP_GGML_API wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node);
 
+    // Split graph without allocating it
+    WSP_GGML_API void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
+
     // Allocate and compute graph on the backend scheduler
     WSP_GGML_API bool wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph); // returns success
     WSP_GGML_API enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
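The scheduler gains wsp_ggml_backend_sched_split_graph (split without allocating) and wsp_ggml_backend_sched_get_buffer_type. A minimal sketch of inspecting a schedule before committing memory, assuming the wsp_-prefixed status enum (WSP_GGML_STATUS_SUCCESS) mirrors upstream ggml:

    // Sketch: split first (no allocation), inspect the plan, then allocate
    // and compute with the existing entry points from this header.
    static bool plan_and_run(wsp_ggml_backend_sched_t sched,
                             struct wsp_ggml_cgraph * graph,
                             wsp_ggml_backend_t backend) {
        wsp_ggml_backend_sched_split_graph(sched, graph); // new: split only
        const int n_splits = wsp_ggml_backend_sched_get_n_splits(sched);
        wsp_ggml_backend_buffer_type_t buft =
            wsp_ggml_backend_sched_get_buffer_type(sched, backend); // new in 0.5.2
        (void) n_splits; (void) buft; // e.g. log or validate the plan here

        if (!wsp_ggml_backend_sched_alloc_graph(sched, graph)) {
            return false;
        }
        return wsp_ggml_backend_sched_graph_compute(sched, graph) == WSP_GGML_STATUS_SUCCESS;
    }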
@@ -7,7 +7,7 @@
 #include "ggml-cpu.h"
 #include "traits.h"
 
-#if defined(__gnu_linux__)
+#if defined(__linux__)
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif
@@ -149,6 +149,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
         if (op->op == WSP_GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
             is_contiguous_2d(op->src[1]) && // src1 must be contiguous
             op->src[0]->buffer && op->src[0]->buffer->buft == wsp_ggml_backend_amx_buffer_type() &&
+            op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
             op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
             (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == WSP_GGML_TYPE_F16))) {
             // src1 must be host buffer
@@ -186,7 +187,7 @@ static size_t wsp_ggml_backend_amx_buffer_type_get_alloc_size(wsp_ggml_backend_b
 #define XFEATURE_XTILEDATA 18
 
 static bool wsp_ggml_amx_init() {
-#if defined(__gnu_linux__)
+#if defined(__linux__)
     if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
         fprintf(stderr, "AMX is not ready to be used!\n");
         return false;
@@ -194,6 +195,8 @@ static bool wsp_ggml_amx_init() {
     return true;
 #elif defined(_WIN32)
     return true;
+#else
+    return false;
 #endif
 }
 
@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * WSP_GGML_RESTRICT s, siz
     const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
     const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
 
-    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
     const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
     const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
     const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * WSP_GGML_RESTRICT s, siz
     const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
     const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
 
-    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
     const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
     const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
     const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -73,7 +73,6 @@
 #define wsp_ggml_vec_dot_tq1_0_q8_K_generic wsp_ggml_vec_dot_tq1_0_q8_K
 #define wsp_ggml_vec_dot_tq2_0_q8_K_generic wsp_ggml_vec_dot_tq2_0_q8_K
 #define wsp_ggml_vec_dot_iq1_m_q8_K_generic wsp_ggml_vec_dot_iq1_m_q8_K
-#define wsp_ggml_vec_dot_mxfp4_q8_0_generic wsp_ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
@@ -151,8 +150,6 @@
 #elif defined(__s390x__)
 // quants.c
 #define wsp_quantize_row_q8_K_generic wsp_quantize_row_q8_K
-#define wsp_ggml_vec_dot_q5_0_q8_0_generic wsp_ggml_vec_dot_q5_0_q8_0
-#define wsp_ggml_vec_dot_q5_1_q8_1_generic wsp_ggml_vec_dot_q5_1_q8_1
 #define wsp_ggml_vec_dot_tq1_0_q8_K_generic wsp_ggml_vec_dot_tq1_0_q8_K
 #define wsp_ggml_vec_dot_tq2_0_q8_K_generic wsp_ggml_vec_dot_tq2_0_q8_K
 #define wsp_ggml_vec_dot_q2_K_q8_K_generic wsp_ggml_vec_dot_q2_K_q8_K
@@ -163,7 +160,6 @@
 #define wsp_ggml_vec_dot_iq3_s_q8_K_generic wsp_ggml_vec_dot_iq3_s_q8_K
 #define wsp_ggml_vec_dot_iq1_s_q8_K_generic wsp_ggml_vec_dot_iq1_s_q8_K
 #define wsp_ggml_vec_dot_iq1_m_q8_K_generic wsp_ggml_vec_dot_iq1_m_q8_K
-#define wsp_ggml_vec_dot_mxfp4_q8_0_generic wsp_ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
@@ -28,6 +28,14 @@ static inline float bf16_to_f32(wsp_ggml_bf16_t x) {
     return WSP_GGML_BF16_TO_FP32(x);
 }
 
+static inline float i32_to_f32(int32_t x) {
+    return x;
+}
+
+static inline int32_t f32_to_i32(float x) {
+    return x;
+}
+
 static inline float f32_to_f32(float x) {
     return x;
 }
@@ -54,6 +62,12 @@ struct type_conversion_table<wsp_ggml_bf16_t> {
     static constexpr wsp_ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
 };
 
+template <>
+struct type_conversion_table<int32_t> {
+    static constexpr float (*to_f32)(int32_t) = i32_to_f32;
+    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
+};
+
 static std::pair<int64_t, int64_t> get_thread_range(const struct wsp_ggml_compute_params * params, const struct wsp_ggml_tensor * src0) {
     const int64_t ith = params->ith;
     const int64_t nth = params->nth;
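This specialization routes I32 through the same conversion table already used for F16 and BF16, so templated CPU kernels need no I32-specific code. An illustrative use of the pattern (the helper below is hypothetical, not part of the package):

    // Hypothetical helper showing how the table is consumed generically:
    // any T with a type_conversion_table specialization gains f32 round-trips.
    template <typename T>
    static void row_to_f32(const T * src, float * dst, int64_t n) {
        for (int64_t i = 0; i < n; ++i) {
            dst[i] = type_conversion_table<T>::to_f32(src[i]);
        }
    }
    // With the new specialization, row_to_f32<int32_t>(...) now instantiates,
    // so I32 rows flow through the same templated kernels as F16/BF16.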
@@ -68,13 +68,7 @@ struct wsp_ggml_compute_params {
 #endif // __VXE2__
 #endif // __s390x__ && __VEC__
 
-#if defined(__s390x__) && defined(WSP_GGML_NNPA)
-#ifndef __NNPA__
-#define __NNPA__
-#endif // __NNPA__
-#endif // __s390x__ && WSP_GGML_NNPA
-
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && defined(__linux__)
 #include <sys/prctl.h>
 #endif
 
@@ -486,6 +480,19 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
     return v_abo + v_abe;
 }
 
+/**
+ * @see https://github.com/ggml-org/llama.cpp/pull/14037
+ */
+inline static float vec_hsum_f32x4(float32x4_t v) {
+    float32x4_t v_temp = v + vec_reve(v);
+    return v_temp[0] + v_temp[1];
+}
+
+inline static int32_t vec_hsum_i32x4(int32x4_t v) {
+    int32x4_t v_temp = v + vec_reve(v);
+    return v_temp[0] + v_temp[1];
+}
+
 inline static int32x4_t wsp_ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
     const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
     return acc + (vec_unpackh(p) + vec_unpackl(p));
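For reference, vec_hsum_f32x4 reduces four lanes by adding the vector to its reverse, after which lanes 0 and 1 together contain all four inputs. A scalar model of the same reduction (illustration only, not package code):

    // v + vec_reve(v) == { v0+v3, v1+v2, v2+v1, v3+v0 }, so summing the
    // first two lanes yields the full horizontal sum.
    static float hsum_model(const float v[4]) {
        const float t0 = v[0] + v[3];
        const float t1 = v[1] + v[2];
        return t0 + t1; // == v0 + v1 + v2 + v3
    }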
@@ -373,6 +373,9 @@ static const struct wsp_ggml_type_traits_cpu type_traits_cpu[WSP_GGML_TYPE_COUNT
         .vec_dot_type = WSP_GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [WSP_GGML_TYPE_I32] = {
+        .from_float = (wsp_ggml_from_float_t) wsp_ggml_cpu_fp32_to_i32,
+    },
 };
 
 const struct wsp_ggml_type_traits_cpu * wsp_ggml_get_type_traits_cpu(enum wsp_ggml_type type) {
@@ -470,10 +473,10 @@ struct wsp_ggml_threadpool {
 struct wsp_ggml_compute_state {
 #ifndef WSP_GGML_USE_OPENMP
     wsp_ggml_thread_t thrd;
-    bool cpumask[WSP_GGML_MAX_N_THREADS];
     int last_graph;
     bool pending;
 #endif
+    bool cpumask[WSP_GGML_MAX_N_THREADS];
     struct wsp_ggml_threadpool * threadpool;
     int ith;
 };
@@ -686,8 +689,13 @@ bool wsp_ggml_is_numa(void) {
 #endif
 
 static void wsp_ggml_init_arm_arch_features(void) {
-#if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+#if defined(__linux__)
     wsp_ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#else
+    // TODO: add support of SVE for non-linux systems
+#error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
+#endif
 #endif
 }
 
@@ -1876,10 +1884,18 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
             {
                 wsp_ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case WSP_GGML_OP_IM2COL_3D:
+            {
+                wsp_ggml_compute_forward_im2col_3d(params, tensor);
+            } break;
         case WSP_GGML_OP_CONV_2D:
            {
                wsp_ggml_compute_forward_conv_2d(params, tensor);
            } break;
+        case WSP_GGML_OP_CONV_3D:
+            {
+                wsp_ggml_compute_forward_conv_3d(params, tensor);
+            } break;
         case WSP_GGML_OP_CONV_2D_DW:
             {
                 wsp_ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2168,6 +2184,10 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
         case WSP_GGML_UNARY_OP_HARDSWISH:
         case WSP_GGML_UNARY_OP_HARDSIGMOID:
         case WSP_GGML_UNARY_OP_EXP:
+        case WSP_GGML_UNARY_OP_FLOOR:
+        case WSP_GGML_UNARY_OP_CEIL:
+        case WSP_GGML_UNARY_OP_ROUND:
+        case WSP_GGML_UNARY_OP_TRUNC:
             {
                 n_tasks = 1;
             } break;
@@ -2176,6 +2196,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
         case WSP_GGML_UNARY_OP_GELU_ERF:
         case WSP_GGML_UNARY_OP_GELU_QUICK:
         case WSP_GGML_UNARY_OP_SILU:
+        case WSP_GGML_UNARY_OP_XIELU:
             {
                 n_tasks = n_threads;
             } break;
@@ -2251,7 +2272,9 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
             } break;
         case WSP_GGML_OP_IM2COL:
         case WSP_GGML_OP_IM2COL_BACK:
+        case WSP_GGML_OP_IM2COL_3D:
         case WSP_GGML_OP_CONV_2D:
+        case WSP_GGML_OP_CONV_3D:
         case WSP_GGML_OP_CONV_2D_DW:
         case WSP_GGML_OP_CONV_TRANSPOSE_1D:
         case WSP_GGML_OP_CONV_TRANSPOSE_2D:
@@ -2686,7 +2709,10 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
                 if (wsp_ggml_is_quantized(node->type) ||
                     // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
                     (node->src[0]->type == WSP_GGML_TYPE_F16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_BF16) ||
-                    (node->src[0]->type == WSP_GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F16)) {
+                    (node->src[0]->type == WSP_GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F16) ||
+                    // conversion between F32 and I32
+                    (node->src[0]->type == WSP_GGML_TYPE_F32 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_I32) ||
+                    (node->src[0]->type == WSP_GGML_TYPE_I32 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F32)) {
                     cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 }
             } break;
@@ -2773,6 +2799,7 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
                 }
             } break;
         case WSP_GGML_OP_CONV_2D:
+        case WSP_GGML_OP_CONV_3D:
             {
                 cur = WSP_GGML_IM2COL_WORK_SIZE;
             } break;
@@ -3064,7 +3091,14 @@ static struct wsp_ggml_threadpool * wsp_ggml_threadpool_new_impl(
 
     threadpool->workers = workers;
 
-#ifndef WSP_GGML_USE_OPENMP
+#ifdef WSP_GGML_USE_OPENMP
+    int32_t cpumask_iter = 0;
+
+    // Compute CPU masks for each thread
+    for (int j = 0; j < tpp->n_threads; j++) {
+        wsp_ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+    }
+#else // WSP_GGML_USE_OPENMP
     wsp_ggml_mutex_init(&threadpool->mutex);
     wsp_ggml_cond_init(&threadpool->cond);
 
@@ -3137,7 +3171,14 @@ enum wsp_ggml_status wsp_ggml_graph_compute(struct wsp_ggml_cgraph * cgraph, str
                 atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
             }
 
-            wsp_ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
+            // Apply thread CPU mask and priority
+            int ith = omp_get_thread_num();
+
+            wsp_ggml_thread_apply_priority(threadpool->prio);
+            if (wsp_ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
+                wsp_ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
+            }
+            wsp_ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
         atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
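Together these two hunks make the OpenMP build compute per-worker CPU masks and apply priority/affinity inside the parallel region, so threadpool affinity settings now take effect with WSP_GGML_USE_OPENMP. A minimal sketch, assuming a wsp_ggml_threadpool_params_default helper mirroring upstream ggml (the cpumask/strict_cpu fields are taken from the hunks above):

    // Sketch: pin workers to cores 0 and 1; as of this release the mask is
    // honored on the OpenMP path, not only on the internal threadpool.
    struct wsp_ggml_threadpool_params tpp = wsp_ggml_threadpool_params_default(2);
    tpp.cpumask[0] = true;  // allow core 0
    tpp.cpumask[1] = true;  // allow core 1
    tpp.strict_cpu = true;  // assign one worker per allowed core, in order
    // ... create the threadpool from tpp and attach it as usual ...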
@@ -3200,20 +3241,12 @@ void wsp_ggml_cpu_fp32_to_fp16(const float * x, wsp_ggml_fp16_t * y, int64_t n)
         __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
         _mm_storel_epi64((__m128i *)(y + i), y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
-        float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
-        uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
-    }
-    for (; i + 3 < n; i += 4) {
-        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
+#elif defined(__riscv_zvfh)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
+        __riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
     }
 #endif
     for (; i < n; ++i) {
@@ -3241,21 +3274,6 @@ void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t * x, float * y, int64_t n)
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i + 0));
-        vec_xst(v_yl, 0, (float *)(y + i + 4));
-    }
-    for (; i + 3 < n; i += 4) {
-        uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i));
-    }
 #endif
 
     for (; i < n; ++i) {
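Both conversion routines drop their s390x NNPA fast paths, and fp32-to-fp16 gains a RISC-V Zvfh path; the scalar tail loops and public signatures are unchanged. Usage stays as before, e.g.:

    // Round trip through the public helpers; only the SIMD fast path selected
    // under the hood changed (Zvfh added, NNPA removed).
    float src[8] = { 0.5f, 1.0f, 1.5f, 2.0f, -0.5f, -1.0f, 65504.0f, 0.0f };
    wsp_ggml_fp16_t half[8];
    float back[8];
    wsp_ggml_cpu_fp32_to_fp16(src, half, 8);
    wsp_ggml_cpu_fp16_to_fp32(half, back, 8);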
@@ -3270,6 +3288,13 @@ void wsp_ggml_cpu_fp32_to_bf16(const float * x, wsp_ggml_bf16_t * y, int64_t n)
     }
 }
 
+void wsp_ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = x[i];
+    }
+}
+
 void wsp_ggml_cpu_bf16_to_fp32(const wsp_ggml_bf16_t * x, float * y, int64_t n) {
     int64_t i = 0;
 #if defined(__AVX2__)
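The new wsp_ggml_cpu_fp32_to_i32 backs the I32 from_float trait registered earlier. Note that the implicit float-to-int conversion in C truncates toward zero rather than rounding:

    // Truncation semantics of the element-wise conversion:
    float x[4] = { 1.9f, -1.9f, 0.4f, 2.0f };
    int32_t y[4];
    wsp_ggml_cpu_fp32_to_i32(x, y, 4); // y == { 1, -1, 0, 2 }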
@@ -3459,14 +3484,6 @@ int wsp_ggml_cpu_has_vxe(void) {
 #endif
 }
 
-int wsp_ggml_cpu_has_nnpa(void) {
-#if defined(WSP_GGML_NNPA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 int wsp_ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return 1;
@@ -3550,13 +3567,17 @@ void wsp_ggml_cpu_init(void) {
 #ifdef WSP_GGML_USE_OPENMP
     //if (!getenv("OMP_WAIT_POLICY")) {
     //    // set the wait policy to active, so that OpenMP threads don't sleep
-    //    putenv("OMP_WAIT_POLICY=active");
+    //    setenv("OMP_WAIT_POLICY", "active", 0)
     //}
 
     if (!getenv("KMP_BLOCKTIME")) {
         // set the time to wait before sleeping a thread
         // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
-        putenv("KMP_BLOCKTIME=200"); // 200ms
+#ifdef _WIN32
+        _putenv_s("KMP_BLOCKTIME", "200"); // 200ms
+#else
+        setenv("KMP_BLOCKTIME", "200", 0); // 200ms
+#endif
     }
 #endif
 }
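This hunk replaces putenv with platform-appropriate calls: setenv on POSIX (overwrite=0 keeps any pre-set value) and _putenv_s on Windows. The same pattern factored into a small portable helper (the helper is ours, not part of the package):

    // Hypothetical "set if unset" wrapper mirroring the hunk above.
    #include <cstdlib>

    static void setenv_if_unset(const char * name, const char * value) {
        if (!std::getenv(name)) {
    #ifdef _WIN32
            _putenv_s(name, value);
    #else
            setenv(name, value, /*overwrite=*/0);
    #endif
        }
    }
    // e.g. setenv_if_unset("KMP_BLOCKTIME", "200");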
@@ -18,6 +18,10 @@
 #    include "kleidiai/kleidiai.h"
 #endif
 
+#ifdef WSP_GGML_USE_CPU_RISCV64_SPACEMIT
+#    include "spacemit/ime.h"
+#endif
+
 #if defined(_WIN32)
 #    define WIN32_LEAN_AND_MEAN
 #    ifndef NOMINMAX
@@ -45,6 +49,12 @@ std::vector<wsp_ggml_backend_buffer_type_t> & wsp_ggml_backend_cpu_get_extra_buf
     }
 #endif
 
+#ifdef WSP_GGML_USE_CPU_RISCV64_SPACEMIT
+    if (wsp_ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
+        bufts.push_back(wsp_ggml_backend_cpu_riscv64_spacemit_buffer_type());
+    }
+#endif
+
 #ifdef WSP_GGML_USE_CPU_KLEIDIAI
     if (wsp_ggml_backend_cpu_kleidiai_buffer_type()) {
         bufts.push_back(wsp_ggml_backend_cpu_kleidiai_buffer_type());
@@ -190,6 +200,7 @@ static const struct wsp_ggml_backend_i wsp_ggml_backend_cpu_i = {
     /* .graph_compute  = */ wsp_ggml_backend_cpu_graph_compute,
     /* .event_record   = */ NULL,
     /* .event_wait     = */ NULL,
+    /* .graph_optimize = */ NULL,
 };
 
 static wsp_ggml_guid_t wsp_ggml_backend_cpu_guid(void) {
@@ -348,8 +359,10 @@ static void wsp_ggml_backend_cpu_device_get_memory(wsp_ggml_backend_dev_t dev, s
     long pages = sysconf(_SC_PHYS_PAGES);
     long page_size = sysconf(_SC_PAGE_SIZE);
     *total = pages * page_size;
+
+    // "free" system memory is ill-defined, for practical purposes assume that all of it is free:
     *free = *total;
-#endif
+#endif // _WIN32
 
     WSP_GGML_UNUSED(dev);
 }
@@ -576,9 +589,6 @@ static wsp_ggml_backend_feature * wsp_ggml_backend_cpu_get_features(wsp_ggml_bac
     if (wsp_ggml_cpu_has_vxe()) {
         features.push_back({ "VXE", "1" });
     }
-    if (wsp_ggml_cpu_has_nnpa()) {
-        features.push_back({ "NNPA", "1" });
-    }
     if (wsp_ggml_cpu_has_wasm_simd()) {
         features.push_back({ "WASM_SIMD", "1" });
     }