whisper.rn 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/android/src/main/jni.cpp +12 -3
  2. package/cpp/ggml-alloc.c +49 -18
  3. package/cpp/ggml-backend-impl.h +0 -3
  4. package/cpp/ggml-backend-reg.cpp +8 -0
  5. package/cpp/ggml-backend.cpp +0 -2
  6. package/cpp/ggml-backend.h +2 -0
  7. package/cpp/ggml-cpu/amx/amx.cpp +1 -0
  8. package/cpp/ggml-cpu/arch/arm/quants.c +428 -26
  9. package/cpp/ggml-cpu/ggml-cpu-impl.h +4 -2
  10. package/cpp/ggml-cpu/ggml-cpu.c +67 -24
  11. package/cpp/ggml-cpu/ops.cpp +489 -364
  12. package/cpp/ggml-cpu/ops.h +4 -4
  13. package/cpp/ggml-cpu/repack.cpp +143 -29
  14. package/cpp/ggml-cpu/simd-mappings.h +25 -25
  15. package/cpp/ggml-cpu/unary-ops.cpp +151 -0
  16. package/cpp/ggml-cpu/unary-ops.h +7 -0
  17. package/cpp/ggml-cpu/vec.cpp +83 -0
  18. package/cpp/ggml-cpu/vec.h +20 -8
  19. package/cpp/ggml-impl.h +67 -2
  20. package/cpp/ggml-metal/ggml-metal-common.cpp +2 -2
  21. package/cpp/ggml-metal/ggml-metal-context.m +5 -6
  22. package/cpp/ggml-metal/ggml-metal-device.cpp +300 -14
  23. package/cpp/ggml-metal/ggml-metal-device.h +26 -1
  24. package/cpp/ggml-metal/ggml-metal-device.m +243 -28
  25. package/cpp/ggml-metal/ggml-metal-impl.h +177 -9
  26. package/cpp/ggml-metal/ggml-metal-ops.cpp +843 -157
  27. package/cpp/ggml-metal/ggml-metal-ops.h +8 -0
  28. package/cpp/ggml-metal/ggml-metal.cpp +8 -3
  29. package/cpp/ggml-metal/ggml-metal.metal +12436 -0
  30. package/cpp/ggml.c +317 -4
  31. package/cpp/ggml.h +139 -0
  32. package/cpp/jsi/RNWhisperJSI.cpp +7 -2
  33. package/cpp/rn-whisper.h +1 -0
  34. package/cpp/whisper.cpp +8 -2
  35. package/ios/RNWhisperContext.mm +3 -1
  36. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  37. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  38. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +67 -2
  39. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +139 -0
  40. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  41. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  42. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-metal.metal +12436 -0
  43. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  44. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  45. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  46. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +67 -2
  47. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +139 -0
  48. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  49. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  50. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  51. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +12436 -0
  52. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  53. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  54. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  55. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +67 -2
  56. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +139 -0
  57. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  58. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  59. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-metal.metal +12436 -0
  60. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  61. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  62. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  63. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +67 -2
  64. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +139 -0
  65. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  66. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  67. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  68. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +12436 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  70. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  71. package/lib/commonjs/version.json +1 -1
  72. package/lib/module/NativeRNWhisper.js.map +1 -1
  73. package/lib/module/version.json +1 -1
  74. package/lib/typescript/NativeRNWhisper.d.ts +2 -0
  75. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  76. package/package.json +1 -1
  77. package/src/NativeRNWhisper.ts +2 -0
  78. package/src/version.json +1 -1
  79. package/whisper-rn.podspec +1 -1
  80. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  81. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  82. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  83. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  84. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  85. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
package/android/src/main/jni.cpp CHANGED
@@ -276,6 +276,7 @@ JNIEXPORT jlong JNICALL
  Java_com_rnwhisper_WhisperContext_initContextWithAsset(
  JNIEnv *env,
  jobject thiz,
+ jint context_id,
  jobject asset_manager,
  jstring model_path_str
  ) {
@@ -290,6 +291,7 @@ Java_com_rnwhisper_WhisperContext_initContextWithAsset(
  const char *model_path_chars = env->GetStringUTFChars(model_path_str, nullptr);
  context = whisper_init_from_asset(env, asset_manager, model_path_chars, cparams);
  env->ReleaseStringUTFChars(model_path_str, model_path_chars);
+ rnwhisper_jsi::addContext(context_id, reinterpret_cast<jlong>(context));
  return reinterpret_cast<jlong>(context);
  }

@@ -297,6 +299,7 @@ JNIEXPORT jlong JNICALL
  Java_com_rnwhisper_WhisperContext_initContextWithInputStream(
  JNIEnv *env,
  jobject thiz,
+ jint context_id,
  jobject input_stream
  ) {
  UNUSED(thiz);
@@ -308,6 +311,7 @@ Java_com_rnwhisper_WhisperContext_initContextWithInputStream(

  struct whisper_context *context = nullptr;
  context = whisper_init_from_input_stream(env, input_stream, cparams);
+ rnwhisper_jsi::addContext(context_id, reinterpret_cast<jlong>(context));
  return reinterpret_cast<jlong>(context);
  }

@@ -421,8 +425,9 @@ Java_com_rnwhisper_WhisperContext_fullWithNewJob(
  LOGI("About to reset timings");
  whisper_reset_timings(context);

- LOGI("About to run whisper_full");
- int code = whisper_full(context, params, audio_data_arr, audio_data_len);
+ int n_processors = readablemap::getInt(env, options, "nProcessors", 1);
+ LOGI("About to run whisper_full_parallel with n_processors=%d", n_processors);
+ int code = whisper_full_parallel(context, params, audio_data_arr, audio_data_len, n_processors);
  if (code == 0) {
  // whisper_print_timings(context);
  }
@@ -441,8 +446,11 @@ Java_com_rnwhisper_WhisperContext_createRealtimeTranscribeJob(
  jlong context_ptr,
  jobject options
  ) {
+ UNUSED(thiz);
+ UNUSED(context_ptr);
  whisper_full_params params = createFullParams(env, options);
  rnwhisper::job* job = rnwhisper::job_new(job_id, params);
+ job->n_processors = readablemap::getInt(env, options, "nProcessors", 1);
  rnwhisper::vad_params vad;
  vad.use_vad = readablemap::getBool(env, options, "useVad", false);
  vad.vad_ms = readablemap::getInt(env, options, "vadMs", 2000);
@@ -534,11 +542,12 @@ Java_com_rnwhisper_WhisperContext_fullWithJob(
  jint n_samples
  ) {
  UNUSED(thiz);
+ UNUSED(env);
  struct whisper_context *context = reinterpret_cast<struct whisper_context *>(context_ptr);

  rnwhisper::job* job = rnwhisper::job_get(job_id);
  float* pcmf32 = job->pcm_slice_to_f32(slice_index, n_samples);
- int code = whisper_full(context, job->params, pcmf32, n_samples);
+ int code = whisper_full_parallel(context, job->params, pcmf32, n_samples, job->n_processors);
  free(pcmf32);
  if (code == 0) {
  // whisper_print_timings(context);
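
Note on the jni.cpp changes above: the Android JNI layer now reads an nProcessors value from the transcribe options (defaulting to 1) and calls whisper_full_parallel instead of whisper_full, and the new context_id parameter registers each context with the JSI bindings. A minimal usage sketch in TypeScript, assuming the option is surfaced through the existing transcribe options as the NativeRNWhisper.ts change suggests (the exact typings added in this release are not shown in this excerpt):

```ts
import { initWhisper } from 'whisper.rn'

// Sketch only: `nProcessors` is inferred from the JNI code that reads
// "nProcessors" and forwards it to whisper_full_parallel; check the
// published TypeScript definitions before relying on the exact name.
async function transcribeParallel(modelPath: string, wavPath: string): Promise<string> {
  const ctx = await initWhisper({ filePath: modelPath })
  const { promise } = ctx.transcribe(wavPath, {
    language: 'en',
    nProcessors: 2, // forwarded to whisper_full_parallel(..., n_processors)
  })
  const { result } = await promise
  await ctx.release()
  return result
}
```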
package/cpp/ggml-alloc.c CHANGED
@@ -226,16 +226,23 @@ static struct buffer_address wsp_ggml_dyn_tallocr_alloc(struct wsp_ggml_dyn_tall
  }

  if (best_fit_block == -1) {
- // no suitable block found, try the last block (this will grow a chunks size)
+ // no suitable block found, try the last block (this may grow a chunks size)
+ int64_t best_reuse = INT64_MIN;
  for (int c = 0; c < alloc->n_chunks; ++c) {
  struct tallocr_chunk * chunk = alloc->chunks[c];
  if (chunk->n_free_blocks > 0) {
  struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
  max_avail = MAX(max_avail, block->size);
- if (block->size >= size) {
+ int64_t reuse_factor = chunk->max_size - block->offset - size;
+ // reuse_factor < 0 : amount of extra memory that needs to be allocated
+ // reuse_factor = 0 : allocated free space exactly matches tensor size
+ // reuse_factor > 0 : superfluous memory that will remain unused
+ bool better_reuse = best_reuse < 0 && reuse_factor > best_reuse;
+ bool better_fit = reuse_factor >= 0 && reuse_factor < best_reuse;
+ if (block->size >= size && (better_reuse || better_fit)) {
  best_fit_chunk = c;
  best_fit_block = chunk->n_free_blocks - 1;
- break;
+ best_reuse = reuse_factor;
  }
  }
  }
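
The hunk above changes the fallback path in wsp_ggml_dyn_tallocr_alloc from taking the first last-block that is large enough to choosing the candidate with the best reuse_factor: prefer the block that grows its chunk the least and, among blocks that already fit, the tightest fit. A self-contained TypeScript sketch of that selection rule (the names below are hypothetical simplifications of the C structures, not part of the package):

```ts
// Illustration of the reuse_factor heuristic described in the diff comments above.
interface ChunkView {
  maxSize: number     // current high-water mark of the chunk
  blockOffset: number // offset of the chunk's last free block
  blockSize: number   // size of that free block
}

function pickChunk(chunks: ChunkView[], size: number): number {
  let best = -1
  let bestReuse = Number.NEGATIVE_INFINITY
  chunks.forEach((c, i) => {
    const reuse = c.maxSize - c.blockOffset - size
    // reuse < 0: the chunk would have to grow by |reuse| bytes
    // reuse >= 0: the tensor fits, leaving `reuse` bytes unused
    const betterReuse = bestReuse < 0 && reuse > bestReuse
    const betterFit = reuse >= 0 && reuse < bestReuse
    if (c.blockSize >= size && (betterReuse || betterFit)) {
      best = i
      bestReuse = reuse
    }
  })
  return best
}

// A 400-byte tensor picks the chunk that can host it without growing:
console.log(pickChunk([
  { maxSize: 1024, blockOffset: 900, blockSize: 600 },  // would grow by 276 bytes
  { maxSize: 2048, blockOffset: 1500, blockSize: 800 }, // fits with 148 bytes spare
], 400)) // -> 1
```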
@@ -268,7 +275,7 @@ static struct buffer_address wsp_ggml_dyn_tallocr_alloc(struct wsp_ggml_dyn_tall
  #ifdef WSP_GGML_ALLOCATOR_DEBUG
  add_allocated_tensor(alloc, addr, tensor);
  size_t cur_max = addr.offset + size;
- if (cur_max > alloc->max_size[addr.chunk]) {
+ if (cur_max > chunk->max_size) {
  // sort allocated_tensors by chunk/offset
  for (int i = 0; i < 1024; i++) {
  for (int j = i + 1; j < 1024; j++) {
@@ -392,12 +399,8 @@ static void wsp_ggml_dyn_tallocr_free(struct wsp_ggml_dyn_tallocr * alloc) {
  free(alloc);
  }

- static size_t wsp_ggml_dyn_tallocr_max_size(struct wsp_ggml_dyn_tallocr * alloc) {
- size_t max_size = 0;
- for (int i = 0; i < alloc->n_chunks; i++) {
- max_size += alloc->chunks[i]->max_size;
- }
- return max_size;
+ static size_t wsp_ggml_dyn_tallocr_max_size(struct wsp_ggml_dyn_tallocr * alloc, int chunk) {
+ return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
  }


@@ -417,10 +420,8 @@ static void wsp_ggml_vbuffer_free(struct vbuffer * buf) {
  free(buf);
  }

- static int wsp_ggml_vbuffer_n_chunks(struct vbuffer * buf) {
- int n = 0;
- while (n < WSP_GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
- return n;
+ static size_t wsp_ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
+ return buf->chunks[chunk] ? wsp_ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
  }

  static size_t wsp_ggml_vbuffer_size(struct vbuffer * buf) {
@@ -604,6 +605,26 @@ static bool wsp_ggml_gallocr_is_allocated(wsp_ggml_gallocr_t galloc, struct wsp_
  return t->data != NULL || wsp_ggml_gallocr_hash_get(galloc, t)->allocated;
  }

+ // free the extra space at the end if the new tensor is smaller
+ static void wsp_ggml_gallocr_free_extra_space(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * node, struct wsp_ggml_tensor * parent) {
+ struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
+ struct hash_node * p_hn = wsp_ggml_gallocr_hash_get(galloc, parent);
+
+ size_t parent_size = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
+ size_t node_size = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+
+ WSP_GGML_ASSERT(parent_size >= node_size);
+
+ if (parent_size > node_size) {
+ struct wsp_ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+ struct buffer_address p_addr = p_hn->addr;
+ p_addr.offset += node_size;
+ size_t extra_size = parent_size - node_size;
+ AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
+ wsp_ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
+ }
+ }
+
  static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * node, int buffer_id) {
  WSP_GGML_ASSERT(buffer_id >= 0);
  struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
@@ -649,6 +670,7 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
  hn->addr = p_hn->addr;
  p_hn->allocated = false; // avoid freeing the parent
  view_src_hn->allocated = false;
+ wsp_ggml_gallocr_free_extra_space(galloc, node, view_src);
  return;
  }
  } else {
@@ -656,6 +678,7 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
  hn->buffer_id = p_hn->buffer_id;
  hn->addr = p_hn->addr;
  p_hn->allocated = false; // avoid freeing the parent
+ wsp_ggml_gallocr_free_extra_space(galloc, node, parent);
  return;
  }
  }
@@ -885,12 +908,20 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
  }
  }

- size_t cur_size = galloc->buffers[i] ? wsp_ggml_vbuffer_size(galloc->buffers[i]) : 0;
- size_t new_size = wsp_ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
-
  // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
- if (new_size > cur_size || galloc->buffers[i] == NULL) {
+ bool realloc = galloc->buffers[i] == NULL;
+ size_t new_size = 0;
+ for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+ size_t cur_chunk_size = galloc->buffers[i] ? wsp_ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
+ size_t new_chunk_size = wsp_ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
+ new_size += new_chunk_size;
+ if (new_chunk_size > cur_chunk_size) {
+ realloc = true;
+ }
+ }
+ if (realloc) {
  #ifndef NDEBUG
+ size_t cur_size = galloc->buffers[i] ? wsp_ggml_vbuffer_size(galloc->buffers[i]) : 0;
  WSP_GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, wsp_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  #endif

package/cpp/ggml-backend-impl.h CHANGED
@@ -209,9 +209,6 @@ extern "C" {
  void * context;
  };

- // Internal backend registry API
- WSP_GGML_API void wsp_ggml_backend_register(wsp_ggml_backend_reg_t reg);
-
  // Add backend dynamic loading support to the backend

  // Initialize the backend
package/cpp/ggml-backend-reg.cpp CHANGED
@@ -57,6 +57,10 @@
  #include "ggml-opencl.h"
  #endif

+ #ifdef WSP_GGML_USE_HEXAGON
+ #include "ggml-hexagon.h"
+ #endif
+
  #ifdef WSP_GGML_USE_BLAS
  #include "ggml-blas.h"
  #endif
@@ -199,6 +203,9 @@ struct wsp_ggml_backend_registry {
  #ifdef WSP_GGML_USE_OPENCL
  register_backend(wsp_ggml_backend_opencl_reg());
  #endif
+ #ifdef WSP_GGML_USE_HEXAGON
+ register_backend(wsp_ggml_backend_hexagon_reg());
+ #endif
  #ifdef WSP_GGML_USE_CANN
  register_backend(wsp_ggml_backend_cann_reg());
  #endif
@@ -598,6 +605,7 @@ void wsp_ggml_backend_load_all_from_path(const char * dir_path) {
  wsp_ggml_backend_load_best("sycl", silent, dir_path);
  wsp_ggml_backend_load_best("vulkan", silent, dir_path);
  wsp_ggml_backend_load_best("opencl", silent, dir_path);
+ wsp_ggml_backend_load_best("hexagon", silent, dir_path);
  wsp_ggml_backend_load_best("musa", silent, dir_path);
  wsp_ggml_backend_load_best("cpu", silent, dir_path);
  // check the environment variable WSP_GGML_BACKEND_PATH to load an out-of-tree backend
package/cpp/ggml-backend.cpp CHANGED
@@ -1698,8 +1698,6 @@ bool wsp_ggml_backend_sched_reserve(wsp_ggml_backend_sched_t sched, struct wsp_g
  WSP_GGML_ASSERT(sched);
  WSP_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

- wsp_ggml_backend_sched_reset(sched);
-
  wsp_ggml_backend_sched_synchronize(sched);

  wsp_ggml_backend_sched_split_graph(sched, measure_graph);
package/cpp/ggml-backend.h CHANGED
@@ -215,6 +215,8 @@ extern "C" {
  // Backend registry
  //

+ WSP_GGML_API void wsp_ggml_backend_register(wsp_ggml_backend_reg_t reg);
+
  WSP_GGML_API void wsp_ggml_backend_device_register(wsp_ggml_backend_dev_t device);

  // Backend (reg) enumeration
package/cpp/ggml-cpu/amx/amx.cpp CHANGED
@@ -149,6 +149,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
  if (op->op == WSP_GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
  is_contiguous_2d(op->src[1]) && // src1 must be contiguous
  op->src[0]->buffer && op->src[0]->buffer->buft == wsp_ggml_backend_amx_buffer_type() &&
+ op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
  op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
  (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == WSP_GGML_TYPE_F16))) {
  // src1 must be host buffer