whisper.rn 0.5.3 → 0.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +5 -0
- package/android/src/main/jni.cpp +13 -0
- package/cpp/ggml-alloc.c +78 -26
- package/cpp/ggml-alloc.h +9 -0
- package/cpp/ggml-backend-impl.h +1 -1
- package/cpp/ggml-backend-reg.cpp +19 -3
- package/cpp/ggml-backend.cpp +72 -20
- package/cpp/ggml-backend.h +2 -1
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +1004 -0
- package/cpp/ggml-cpu/arch/x86/repack.cpp +6 -6
- package/cpp/ggml-cpu/arch-fallback.h +50 -2
- package/cpp/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/cpp/ggml-cpu/ggml-cpu.c +139 -58
- package/cpp/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/ggml-cpu/ops.cpp +170 -18
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/repack.cpp +531 -5
- package/cpp/ggml-cpu/repack.h +14 -0
- package/cpp/ggml-cpu/simd-mappings.h +16 -18
- package/cpp/ggml-cpu/vec.cpp +41 -1
- package/cpp/ggml-cpu/vec.h +241 -138
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +0 -4
- package/cpp/ggml-metal/ggml-metal-context.m +26 -16
- package/cpp/ggml-metal/ggml-metal-device.cpp +452 -371
- package/cpp/ggml-metal/ggml-metal-device.h +87 -65
- package/cpp/ggml-metal/ggml-metal-device.m +263 -104
- package/cpp/ggml-metal/ggml-metal-impl.h +58 -4
- package/cpp/ggml-metal/ggml-metal-ops.cpp +415 -98
- package/cpp/ggml-metal/ggml-metal-ops.h +4 -0
- package/cpp/ggml-metal/ggml-metal.cpp +6 -5
- package/cpp/ggml-metal/ggml-metal.metal +404 -34
- package/cpp/ggml.c +110 -31
- package/cpp/ggml.h +51 -12
- package/cpp/jsi/RNWhisperJSI.cpp +1 -0
- package/cpp/whisper.cpp +17 -4
- package/ios/CMakeLists.txt +21 -1
- package/ios/RNWhisperContext.mm +5 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +0 -4
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +51 -12
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-metal.metal +404 -34
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +0 -4
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +51 -12
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +404 -34
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +0 -4
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +51 -12
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-metal.metal +404 -34
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +0 -4
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +51 -12
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +404 -34
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/jest-mock.js +2 -0
- package/lib/commonjs/jest-mock.js.map +1 -1
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +156 -12
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/jest-mock.js +2 -0
- package/lib/module/jest-mock.js.map +1 -1
- package/lib/module/realtime-transcription/RealtimeTranscriber.js +155 -12
- package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +1 -0
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +29 -0
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
- package/lib/typescript/realtime-transcription/types.d.ts +7 -0
- package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNWhisper.ts +1 -0
- package/src/jest-mock.ts +2 -0
- package/src/realtime-transcription/RealtimeTranscriber.ts +179 -9
- package/src/realtime-transcription/types.ts +9 -0
- package/src/version.json +1 -1
package/README.md
CHANGED

@@ -98,7 +98,7 @@ Voice Activity Detection allows you to detect speech segments in audio data usin
 import { initWhisperVad } from 'whisper.rn'
 
 const vadContext = await initWhisperVad({
-  filePath: require('./assets/ggml-silero-
+  filePath: require('./assets/ggml-silero-v6.2.0.bin'), // VAD model file
   useGpu: true, // Use GPU acceleration (iOS only)
   nThreads: 4, // Number of threads for processing
 })

package/android/src/main/java/com/rnwhisper/WhisperContext.java
CHANGED

@@ -425,6 +425,10 @@ public class WhisperContext {
     }
     data.putString("result", builder.toString());
     data.putArray("segments", segments);
+    String language = getDetectedLanguage(context);
+    if (language != null) {
+      data.putString("language", language);
+    }
     return data;
   }
 
@@ -556,6 +560,7 @@ public class WhisperContext {
   protected static native int getTextSegmentT0(long context, int index);
   protected static native int getTextSegmentT1(long context, int index);
   protected static native boolean getTextSegmentSpeakerTurnNext(long context, int index);
+  protected static native String getDetectedLanguage(long context);
 
   protected static native void createRealtimeTranscribeJob(
       int job_id,
package/android/src/main/jni.cpp
CHANGED

@@ -632,6 +632,19 @@ Java_com_rnwhisper_WhisperContext_getTextSegmentSpeakerTurnNext(
     return whisper_full_get_segment_speaker_turn_next(context, index);
 }
 
+JNIEXPORT jstring JNICALL
+Java_com_rnwhisper_WhisperContext_getDetectedLanguage(
+    JNIEnv *env, jobject thiz, jlong context_ptr) {
+    UNUSED(thiz);
+    struct whisper_context *context = reinterpret_cast<struct whisper_context *>(context_ptr);
+    int lang_id = whisper_full_lang_id(context);
+    const char *lang_str = whisper_lang_str(lang_id);
+    if (lang_str == nullptr) {
+        return nullptr;
+    }
+    return env->NewStringUTF(lang_str);
+}
+
 JNIEXPORT jstring JNICALL
 Java_com_rnwhisper_WhisperContext_bench(
     JNIEnv *env,
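
The new JNI hook is a thin wrapper over two existing whisper.cpp calls. As a standalone sketch (assuming only whisper.h; detected_language is a hypothetical helper name, not part of the package):

#include "whisper.h"

// whisper_full_lang_id() returns the id of the language detected during the
// most recent whisper_full() run; whisper_lang_str() maps that id to a short
// code such as "en", or NULL for an invalid id.
static const char * detected_language(struct whisper_context * ctx) {
    const int lang_id = whisper_full_lang_id(ctx);
    return whisper_lang_str(lang_id);
}
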
package/cpp/ggml-alloc.c
CHANGED

@@ -25,6 +25,7 @@ static bool wsp_ggml_is_view(const struct wsp_ggml_tensor * t) {
 // ops that return true for this function must not use restrict pointers for their backend implementations
 bool wsp_ggml_op_can_inplace(enum wsp_ggml_op op) {
     switch (op) {
+        case WSP_GGML_OP_FILL:
         case WSP_GGML_OP_SCALE:
         case WSP_GGML_OP_DIAG_MASK_ZERO:
         case WSP_GGML_OP_DIAG_MASK_INF:
@@ -311,16 +312,9 @@ static struct buffer_address wsp_ggml_dyn_tallocr_alloc(struct wsp_ggml_dyn_tall
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void
+static void wsp_ggml_dyn_tallocr_free_bytes(struct wsp_ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
     size = aligned_offset(NULL, size, alloc->alignment);
 
-    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
-        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
-
-#ifdef WSP_GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, addr, tensor);
-#endif
-
     struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
 
     // see if we can merge with an existing block
@@ -356,8 +350,6 @@ static void wsp_ggml_dyn_tallocr_free_tensor(struct wsp_ggml_dyn_tallocr * alloc
     }
     // otherwise, add a new block
     wsp_ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
-
-    WSP_GGML_UNUSED(tensor);
 }
 
 static void wsp_ggml_dyn_tallocr_reset(struct wsp_ggml_dyn_tallocr * alloc) {
@@ -602,7 +594,9 @@ static bool wsp_ggml_gallocr_is_own(wsp_ggml_gallocr_t galloc, struct wsp_ggml_t
 }
 
 static bool wsp_ggml_gallocr_is_allocated(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * t) {
-    return t->data != NULL
+    return t->data != NULL // tensor data already set externally
+        || t->buffer // tensor on external buffer (but not yet allocated)
+        || wsp_ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
 }
 
 // free the extra space at the end if the new tensor is smaller
@@ -615,13 +609,17 @@ static void wsp_ggml_gallocr_free_extra_space(wsp_ggml_gallocr_t galloc, struct
 
     WSP_GGML_ASSERT(parent_size >= node_size);
 
+    // note: we want after the freeing the chunks to continue to be aligned
+    struct wsp_ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+    parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
+    node_size   = aligned_offset(NULL, node_size, p_alloc->alignment);
+
     if (parent_size > node_size) {
-        struct wsp_ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
         struct buffer_address p_addr = p_hn->addr;
         p_addr.offset += node_size;
         size_t extra_size = parent_size - node_size;
         AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
-
+        wsp_ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
     }
 }
 
@@ -705,7 +703,14 @@ static void wsp_ggml_gallocr_free_node(wsp_ggml_gallocr_t galloc, struct wsp_ggm
     struct wsp_ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
     wsp_ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = wsp_ggml_backend_buft_get_alloc_size(buft, node);
-
+
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
+#ifdef WSP_GGML_ALLOCATOR_DEBUG
+    remove_allocated_tensor(alloc, hn->addr, node);
+#endif
+
+    wsp_ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
     hn->allocated = false;
 }
 
@@ -820,7 +825,8 @@ static void wsp_ggml_gallocr_alloc_graph_impl(wsp_ggml_gallocr_t galloc, struct
     }
 }
 
-bool
+static bool wsp_ggml_gallocr_reserve_n_impl(
+        wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;
@@ -921,15 +927,23 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
         }
         if (realloc) {
 #ifndef NDEBUG
-
-
+            {
+                size_t cur_size = galloc->buffers[i] ? wsp_ggml_vbuffer_size(galloc->buffers[i]) : 0;
+                if (cur_size > 0) {
+                    WSP_GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
+                        __func__, wsp_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+                }
+            }
 #endif
-
             wsp_ggml_vbuffer_free(galloc->buffers[i]);
-
-
-
-
+            if (no_alloc) {
+                galloc->buffers[i] = NULL;
+            } else {
+                galloc->buffers[i] = wsp_ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], WSP_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+                if (galloc->buffers[i] == NULL) {
+                    WSP_GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, wsp_ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                    return false;
+                }
             }
         }
     }
@@ -937,6 +951,21 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
     return true;
 }
 
+void wsp_ggml_gallocr_reserve_n_size(
+        wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
+    WSP_GGML_ASSERT(wsp_ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        sizes[i] = 0;
+        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+            sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
+        }
+    }
+}
+
+bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    return wsp_ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
+}
+
 bool wsp_ggml_gallocr_reserve(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph *graph) {
     return wsp_ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
@@ -1139,7 +1168,8 @@ static bool alloc_tensor_range(struct wsp_ggml_context * ctx,
     return true;
 }
 
-wsp_ggml_backend_buffer_t
+static wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors_from_buft_impl(
+        struct wsp_ggml_context * ctx, wsp_ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
     WSP_GGML_ASSERT(wsp_ggml_get_no_alloc(ctx) == true);
 
     size_t alignment = wsp_ggml_backend_buft_get_alignment(buft);
@@ -1147,6 +1177,7 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct ws
 
     wsp_ggml_backend_buffer_t * buffers = NULL;
     size_t n_buffers = 0;
+    *nbytes_total = 0;
 
     size_t cur_buf_size = 0;
     struct wsp_ggml_tensor * first = wsp_ggml_get_first_tensor(ctx);
@@ -1158,10 +1189,11 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct ws
 
         if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
-            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+            if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
             }
             first = t;
+            *nbytes_total += cur_buf_size;
             cur_buf_size = this_size;
         } else {
             cur_buf_size += this_size;
@@ -1170,15 +1202,21 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct ws
 
     // allocate remaining tensors
     if (cur_buf_size > 0) {
-
+        *nbytes_total += cur_buf_size;
+        if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
             return NULL;
         }
     }
 
+    if (no_alloc) {
+        return NULL;
+    }
+
     if (n_buffers == 0) {
 #ifndef NDEBUG
         WSP_GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
+        WSP_GGML_ASSERT(!buffers);
         return NULL;
     }
 
@@ -1188,10 +1226,24 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct ws
     } else {
         buffer = wsp_ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
     }
-
+    if (buffers) {
+        free(buffers); // can be NULL if context is empty or no_alloc
+    }
     return buffer;
 }
 
+size_t wsp_ggml_backend_alloc_ctx_tensors_from_buft_size(struct wsp_ggml_context * ctx, wsp_ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    wsp_ggml_backend_buffer_t buf = wsp_ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
+    WSP_GGML_ASSERT(!buf);
+    return nbytes_total;
+}
+
+wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct wsp_ggml_context * ctx, wsp_ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    return wsp_ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
+}
+
 wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors(struct wsp_ggml_context * ctx, wsp_ggml_backend_t backend) {
     return wsp_ggml_backend_alloc_ctx_tensors_from_buft(ctx, wsp_ggml_backend_get_default_buffer_type(backend));
 }
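
The refactoring above threads a no_alloc flag through the reservation path, so buffer sizes can be measured without committing backend memory. A minimal usage sketch, assuming a single-buffer galloc and a caller-built worst-case graph (measure_compute_buffer is a hypothetical helper, not part of the package):

#include "ggml-alloc.h"

// Dry-run reservation: returns the size wsp_ggml_gallocr_reserve_n would
// allocate for this graph, without actually allocating it.
size_t measure_compute_buffer(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph * graph) {
    size_t size = 0; // one slot per galloc buffer; a single-buffer galloc writes one value
    wsp_ggml_gallocr_reserve_n_size(galloc, graph, /*node_buffer_ids =*/ NULL, /*leaf_buffer_ids =*/ NULL, &size);
    return size;
}
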
package/cpp/ggml-alloc.h
CHANGED

@@ -53,7 +53,14 @@ WSP_GGML_API void wsp_ggml_gallocr_free(wsp_ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: wsp_ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// wsp_ggml_gallocr_resrve_n_size writes the buffer sizes per galloc buffer that would be allocated by wsp_ggml_gallocr_reserve_n to sizes
 WSP_GGML_API bool wsp_ggml_gallocr_reserve(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph * graph);
+WSP_GGML_API void wsp_ggml_gallocr_reserve_n_size(
+    wsp_ggml_gallocr_t galloc,
+    struct wsp_ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
 WSP_GGML_API bool wsp_ggml_gallocr_reserve_n(
     wsp_ggml_gallocr_t galloc,
     struct wsp_ggml_cgraph * graph,
@@ -68,6 +75,8 @@ WSP_GGML_API size_t wsp_ggml_gallocr_get_buffer_size(wsp_ggml_gallocr_t galloc,
 
 // Utils
 // Create a buffer and allocate all the tensors in a wsp_ggml_context
+// wsp_ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by wsp_ggml_backend_alloc_ctx_tensors_from_buft
+WSP_GGML_API size_t wsp_ggml_backend_alloc_ctx_tensors_from_buft_size(struct wsp_ggml_context * ctx, wsp_ggml_backend_buffer_type_t buft);
 WSP_GGML_API struct wsp_ggml_backend_buffer * wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct wsp_ggml_context * ctx, wsp_ggml_backend_buffer_type_t buft);
 WSP_GGML_API struct wsp_ggml_backend_buffer * wsp_ggml_backend_alloc_ctx_tensors(struct wsp_ggml_context * ctx, wsp_ggml_backend_t backend);
 
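
A sketch of how the new sizing helper can pair with the existing allocator: estimate a context's tensor footprint on a buffer type before allocating it (alloc_if_it_fits is a hypothetical caller; error handling omitted):

#include "ggml-alloc.h"

// Dry-run first: ask how many bytes the tensors in `ctx` would occupy on
// `buft`, then perform the real allocation only if it fits the budget.
wsp_ggml_backend_buffer_t alloc_if_it_fits(struct wsp_ggml_context * ctx,
                                           wsp_ggml_backend_buffer_type_t buft,
                                           size_t budget) {
    size_t needed = wsp_ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
    if (needed > budget) {
        return NULL; // caller can fall back to another buffer type
    }
    return wsp_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
}
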
package/cpp/ggml-backend-impl.h
CHANGED

@@ -144,7 +144,7 @@ extern "C" {
         // device description: short informative description of the device, could be the model name
         const char * (*get_description)(wsp_ggml_backend_dev_t dev);
 
-        // device memory in bytes
+        // device memory in bytes: 0 bytes to indicate no memory to report
        void (*get_memory)(wsp_ggml_backend_dev_t dev, size_t * free, size_t * total);
 
         // device type
package/cpp/ggml-backend-reg.cpp
CHANGED

@@ -73,6 +73,10 @@
 #include "ggml-cann.h"
 #endif
 
+#ifdef WSP_GGML_USE_ZENDNN
+#include "ggml-zendnn.h"
+#endif
+
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 #    pragma clang diagnostic push
@@ -203,6 +207,9 @@ struct wsp_ggml_backend_registry {
 #ifdef WSP_GGML_USE_OPENCL
         register_backend(wsp_ggml_backend_opencl_reg());
 #endif
+#ifdef WSP_GGML_USE_ZENDNN
+        register_backend(wsp_ggml_backend_zendnn_reg());
+#endif
 #ifdef WSP_GGML_USE_HEXAGON
         register_backend(wsp_ggml_backend_hexagon_reg());
 #endif
@@ -534,8 +541,12 @@ static wsp_ggml_backend_reg_t wsp_ggml_backend_load_best(const char * name, bool
     fs::path best_path;
 
     for (const auto & search_path : search_paths) {
-        if (!fs::exists(search_path)) {
-
+        if (std::error_code ec; !fs::exists(search_path, ec)) {
+            if (ec) {
+                WSP_GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
+            } else {
+                WSP_GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
+            }
             continue;
         }
         fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
@@ -575,8 +586,12 @@ static wsp_ggml_backend_reg_t wsp_ggml_backend_load_best(const char * name, bool
     for (const auto & search_path : search_paths) {
         fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
         fs::path path = search_path / filename;
-        if (fs::exists(path)) {
+        if (std::error_code ec; fs::exists(path, ec)) {
             return get_reg().load_backend(path, silent);
+        } else {
+            if (ec) {
+                WSP_GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(path).c_str(), ec.message().c_str());
+            }
         }
     }
     return nullptr;
@@ -597,6 +612,7 @@ void wsp_ggml_backend_load_all_from_path(const char * dir_path) {
 #endif
 
     wsp_ggml_backend_load_best("blas", silent, dir_path);
+    wsp_ggml_backend_load_best("zendnn", silent, dir_path);
     wsp_ggml_backend_load_best("cann", silent, dir_path);
     wsp_ggml_backend_load_best("cuda", silent, dir_path);
     wsp_ggml_backend_load_best("hip", silent, dir_path);
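
The switch to the std::error_code overloads matters because the throwing overload of std::filesystem::exists raises std::filesystem::filesystem_error when the underlying stat fails (for example, permission denied on a search path). A standalone illustration of the non-throwing pattern used above (the path is a made-up example):

#include <filesystem>
#include <iostream>
#include <system_error>

int main() {
    std::error_code ec;
    // Non-throwing overload: a failed stat sets `ec` and returns false
    // instead of throwing.
    const bool found = std::filesystem::exists("/opt/ggml-backends", ec);
    if (ec) {
        std::cout << "stat failed: " << ec.message() << '\n';
    } else {
        std::cout << (found ? "exists" : "does not exist") << '\n';
    }
    return 0;
}
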
package/cpp/ggml-backend.cpp
CHANGED

@@ -36,12 +36,11 @@ const char * wsp_ggml_backend_buft_name(wsp_ggml_backend_buffer_type_t buft) {
 }
 
 wsp_ggml_backend_buffer_t wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
+    WSP_GGML_ASSERT(buft);
     if (size == 0) {
         // return a dummy buffer for zero-sized allocations
         return wsp_ggml_backend_buffer_init(buft, {}, NULL, 0);
     }
-
-    WSP_GGML_ASSERT(buft);
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -128,6 +127,12 @@ void * wsp_ggml_backend_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
         return NULL;
     }
 
+    // FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
+    // I don't know whether the above comment is correct
+    if (!buffer->iface.get_base) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);
 
     WSP_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -723,6 +728,12 @@ struct wsp_ggml_backend_sched {
     bool op_offload;
 
     int debug;
+
+    // used for debugging graph reallocations [WSP_GGML_SCHED_DEBUG_REALLOC]
+    // ref: https://github.com/ggml-org/llama.cpp/pull/17617
+    int debug_realloc;
+    int debug_graph_size;
+    int debug_prev_graph_size;
 };
 
 #define hash_id(tensor) wsp_ggml_hash_find_or_insert(&sched->hash_set, tensor)

@@ -1234,10 +1245,8 @@ void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct w
                 tensor_copy = wsp_ggml_dup_tensor_layout(sched->ctx, src);
                 wsp_ggml_format_name(tensor_copy, "%s#%s#%d", wsp_ggml_backend_name(backend), src->name, c);
             }
-
-
-                wsp_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
-            }
+            wsp_ggml_set_input(tensor_copy);
+            wsp_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
             tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
             SET_CAUSE(tensor_copy, "4.cpy");
         }
@@ -1289,6 +1298,11 @@ void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct w
     }
 
     int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*WSP_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
+
+    // remember the actual graph_size for performing reallocation checks later [WSP_GGML_SCHED_DEBUG_REALLOC]
+    sched->debug_prev_graph_size = sched->debug_graph_size;
+    sched->debug_graph_size      = graph_size;
+
     if (sched->graph.size < graph_size) {
         sched->graph.size = graph_size;
         sched->graph.nodes = (wsp_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct wsp_ggml_tensor *));
@@ -1395,14 +1409,27 @@ static bool wsp_ggml_backend_sched_alloc_splits(wsp_ggml_backend_sched_t sched)
 
     // allocate graph
     if (backend_ids_changed || !wsp_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+#ifndef NDEBUG
+        WSP_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+#endif
+
+        if (sched->debug_realloc > 0) {
+            // we are interested only in situations where the graph was reallocated even though its size remained the same [WSP_GGML_SCHED_DEBUG_REALLOC]
+            // example: https://github.com/ggml-org/llama.cpp/pull/17143
+            const bool unexpected = !backend_ids_changed && sched->debug_prev_graph_size == sched->debug_graph_size;
+
+            if (unexpected || sched->debug_realloc > 1) {
+                WSP_GGML_ABORT("%s: unexpected graph reallocation (graph size = %d, nodes = %d, leafs = %d), debug_realloc = %d\n", __func__,
+                    sched->debug_graph_size, sched->graph.n_nodes, sched->graph.n_leafs, sched->debug_realloc);
+            }
+        }
+
         // the re-allocation may cause the split inputs to be moved to a different address
         // synchronize without wsp_ggml_backend_sched_synchronize to avoid changing cur_copy
         for (int i = 0; i < sched->n_backends; i++) {
             wsp_ggml_backend_synchronize(sched->backends[i]);
         }
-
-        WSP_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
-#endif
+
         wsp_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!wsp_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
             WSP_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
@@ -1614,6 +1641,14 @@ wsp_ggml_backend_sched_t wsp_ggml_backend_sched_new(
 
     const char * WSP_GGML_SCHED_DEBUG = getenv("WSP_GGML_SCHED_DEBUG");
     sched->debug = WSP_GGML_SCHED_DEBUG ? atoi(WSP_GGML_SCHED_DEBUG) : 0;
+
+    sched->debug_realloc = 0;
+#ifdef WSP_GGML_SCHED_NO_REALLOC
+    sched->debug_realloc = 1;
+#endif
+    const char * WSP_GGML_SCHED_DEBUG_REALLOC = getenv("WSP_GGML_SCHED_DEBUG_REALLOC");
+    sched->debug_realloc = WSP_GGML_SCHED_DEBUG_REALLOC ? atoi(WSP_GGML_SCHED_DEBUG_REALLOC) : sched->debug_realloc;
+
     sched->n_backends = n_backends;
     sched->n_copies = parallel ? WSP_GGML_SCHED_MAX_COPIES : 1;
 
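
The reallocation check is off by default; it can be baked in at build time with the WSP_GGML_SCHED_NO_REALLOC define or toggled per run through the environment variable. A sketch of the runtime toggle (POSIX setenv; the variable is read once, inside wsp_ggml_backend_sched_new(), so it must be set beforehand):

#include <stdlib.h>

// 1 aborts only on unexpected reallocations (same graph size, unchanged
// backend ids); per the hunk above, any value > 1 aborts on every one.
void enable_sched_realloc_check(void) {
    setenv("WSP_GGML_SCHED_DEBUG_REALLOC", "1", /*overwrite =*/ 1);
}
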

@@ -1630,6 +1665,9 @@ wsp_ggml_backend_sched_t wsp_ggml_backend_sched_new(
     sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
     sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
+    sched->debug_graph_size      = 0;
+    sched->debug_prev_graph_size = 0;
+
     sched->context_buffer_size = wsp_ggml_sched_max_splits*WSP_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct wsp_ggml_tensor) + wsp_ggml_graph_overhead_custom(graph_size, false);
     sched->context_buffer = (char *) malloc(sched->context_buffer_size);
 
@@ -1694,6 +1732,20 @@ void wsp_ggml_backend_sched_reset(wsp_ggml_backend_sched_t sched) {
     sched->is_alloc = false;
 }
 
+void wsp_ggml_backend_sched_reserve_size(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * measure_graph, size_t * sizes) {
+    WSP_GGML_ASSERT(sched);
+    WSP_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+    WSP_GGML_ASSERT(sizes);
+
+    wsp_ggml_backend_sched_reset(sched);
+
+    wsp_ggml_backend_sched_synchronize(sched);
+
+    wsp_ggml_backend_sched_split_graph(sched, measure_graph);
+
+    wsp_ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
+}
+
 bool wsp_ggml_backend_sched_reserve(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * measure_graph) {
     WSP_GGML_ASSERT(sched);
     WSP_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
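
wsp_ggml_backend_sched_reserve_size mirrors wsp_ggml_backend_sched_reserve but only measures. A hedged sketch, assuming one size slot per scheduler backend (the scheduler's galloc is built with one buffer type per backend; measure_sched_buffers is hypothetical):

#include <vector>
#include "ggml-backend.h"

// Dry-run reservation: writes the compute-buffer size each backend would
// need for `measure_graph`, without allocating any of them.
std::vector<size_t> measure_sched_buffers(wsp_ggml_backend_sched_t sched,
                                          struct wsp_ggml_cgraph * measure_graph) {
    std::vector<size_t> sizes(wsp_ggml_backend_sched_get_n_backends(sched));
    wsp_ggml_backend_sched_reserve_size(sched, measure_graph, sizes.data());
    return sizes; // sizes[i] corresponds to the scheduler's backend i
}
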

@@ -2001,7 +2053,7 @@ void wsp_ggml_backend_graph_copy_free(struct wsp_ggml_backend_graph_copy copy) {
     wsp_ggml_free(copy.ctx_unallocated);
 }
 
-bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data, struct wsp_ggml_tensor *
+bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data, struct wsp_ggml_tensor const * const * test_nodes, size_t num_test_nodes) {
     struct wsp_ggml_backend_graph_copy copy = wsp_ggml_backend_graph_copy(backend2, graph);
     if (copy.buffer == NULL) {
         return false;
@@ -2012,22 +2064,22 @@ bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggm
 
     assert(g1->n_nodes == g2->n_nodes);
 
-    if (
-
+    if (num_test_nodes != 0) {
+        WSP_GGML_ASSERT(test_nodes);
+        // Compute the whole graph and only test the output for specific tensors
         wsp_ggml_backend_graph_compute(backend1, g1);
         wsp_ggml_backend_graph_compute(backend2, g2);
 
-
+        bool verified = false;
         for (int i = 0; i < g1->n_nodes; i++) {
-
-
-
-
+            for (size_t j = 0; j < num_test_nodes; ++j) {
+                if (g1->nodes[i] == test_nodes[j]) {
+                    callback(i, g1->nodes[i], g2->nodes[i], user_data);
+                    verified = true;
+                }
             }
         }
-        WSP_GGML_ASSERT(
-
-        callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
+        WSP_GGML_ASSERT(verified);
     } else {
         for (int i = 0; i < g1->n_nodes; i++) {
             struct wsp_ggml_tensor * t1 = g1->nodes[i];
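
The comparison helper now takes an array of test nodes instead of a single node. A hedged caller sketch (shapes_match_cb and compare_outputs are hypothetical; a real callback would compare tensor contents, e.g. read back via wsp_ggml_backend_tensor_get):

#include "ggml.h"
#include "ggml-backend.h"

// Minimal eval callback: accept a node pair if both backends produced
// tensors of the same shape.
static bool shapes_match_cb(int node_index, struct wsp_ggml_tensor * t1,
                            struct wsp_ggml_tensor * t2, void * user_data) {
    (void) node_index; (void) user_data;
    return wsp_ggml_are_same_shape(t1, t2);
}

static bool compare_outputs(wsp_ggml_backend_t b1, wsp_ggml_backend_t b2,
                            struct wsp_ggml_cgraph * graph,
                            struct wsp_ggml_tensor * out_a,
                            struct wsp_ggml_tensor * out_b) {
    const struct wsp_ggml_tensor * test_nodes[] = { out_a, out_b };
    // Both backends compute the whole graph; the callback fires only for the
    // listed nodes, and the implementation asserts that at least one of them
    // was found in the graph.
    return wsp_ggml_backend_compare_graph_backend(b1, b2, graph, shapes_match_cb,
                                                  /*user_data =*/ NULL,
                                                  test_nodes, /*num_test_nodes =*/ 2);
}
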
package/cpp/ggml-backend.h
CHANGED

@@ -307,6 +307,7 @@ extern "C" {
     WSP_GGML_API void wsp_ggml_backend_sched_free(wsp_ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
+    WSP_GGML_API void wsp_ggml_backend_sched_reserve_size(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * measure_graph, size_t * sizes);
     WSP_GGML_API bool wsp_ggml_backend_sched_reserve(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * measure_graph); // returns success
 
     WSP_GGML_API int wsp_ggml_backend_sched_get_n_backends(wsp_ggml_backend_sched_t sched);
@@ -357,7 +358,7 @@ extern "C" {
     typedef bool (*wsp_ggml_backend_eval_callback)(int node_index, struct wsp_ggml_tensor * t1, struct wsp_ggml_tensor * t2, void * user_data);
 
     // Compare the output of two backends
-    WSP_GGML_API bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data, struct wsp_ggml_tensor *
+    WSP_GGML_API bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data, struct wsp_ggml_tensor const * const * test_nodes, size_t num_test_nodes);
 
     // Tensor initialization
     WSP_GGML_API enum wsp_ggml_status wsp_ggml_backend_tensor_alloc(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, void * addr);
|