@fugood/llama.node 1.4.7 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +22 -23
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +40 -16
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +91 -92
  23. package/src/llama.cpp/common/sampling.h +11 -6
  24. package/src/llama.cpp/common/speculative.cpp +1 -1
  25. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  26. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  27. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  29. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  30. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +60 -39
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  35. package/src/llama.cpp/include/llama.h +18 -1
  36. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  37. package/src/llama.cpp/src/llama-arch.h +9 -2
  38. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  39. package/src/llama.cpp/src/llama-batch.h +4 -2
  40. package/src/llama.cpp/src/llama-context.cpp +93 -23
  41. package/src/llama.cpp/src/llama-context.h +8 -2
  42. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  43. package/src/llama.cpp/src/llama-graph.h +17 -4
  44. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  45. package/src/llama.cpp/src/llama-hparams.h +5 -1
  46. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  47. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  48. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  49. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  50. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  51. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  52. package/src/llama.cpp/src/llama-model.cpp +103 -44
  53. package/src/llama.cpp/src/llama-model.h +1 -0
  54. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  55. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  56. package/src/llama.cpp/src/llama.cpp +675 -1
  57. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  58. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  59. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  60. package/src/llama.cpp/src/models/models.h +5 -5
  61. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  62. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  63. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
  // arguments can be nullptr to skip printing
  void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

+ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
+
  // extended sampling implementation:
  //
  // - set logits
@@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
  // - check if the token fits the grammar (if any)
  // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
  //
- // if grammar_first is true, the grammar is applied before the samplers (slower)
- // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
- //
- llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);

  // generalized version of common_sampler_sample
  //
@@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
  //
  // returns at least 1 token, up to idxs.size()
  //
- std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);

  // assume idxs == [ 0, 1, 2, ..., draft.size() ]
- std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);

  uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

@@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:

  llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
  const char * grammar_kind, const char * grammar_data);
+
+ struct common_sampler_deleter {
+ void operator()(common_sampler * s) { common_sampler_free(s); }
+ };
+
+ typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
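The hunks above remove the `grammar_first` flag from the sampling entry points and add an RAII alias (`common_sampler_ptr`) plus a `common_sampler_get` accessor. The following sketch is illustrative only and not part of the diff; it assumes the existing `common_sampler_init`, `common_sampler_accept`, and `common_sampler_free` signatures in `common/sampling.h` are otherwise unchanged.

```cpp
#include "sampling.h"

// Hypothetical usage of the updated API surface shown above.
void sample_one(const llama_model * model, llama_context * ctx,
                const common_params_sampling & sparams) {
    // common_sampler_ptr releases the sampler via common_sampler_free() on scope exit
    common_sampler_ptr smpl(common_sampler_init(model, sparams));
    if (!smpl) {
        return;
    }

    // the grammar_first parameter is gone; grammar handling is now internal to the call
    const llama_token id = common_sampler_sample(smpl.get(), ctx, /*idx =*/ -1);
    common_sampler_accept(smpl.get(), id, /*accept_grammar =*/ true);
}
```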
@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
  for (int i = 0; i < params.n_draft; ++i) {
  common_batch_clear(batch);

- common_sampler_sample(smpl, ctx_dft, 0, true);
+ common_sampler_sample(smpl, ctx_dft, 0);

  const auto * cur_p = common_sampler_get_candidates(smpl, true);

@@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
  # TODO
  else()
  set(GGML_STANDALONE OFF)
+
+ if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+ endif()
  endif()

  if (EMSCRIPTEN)
@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
  // call with a worst-case graph to avoid buffer reallocations
  // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
  // returns false if the buffer allocation failed
+ // ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
  GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+ GGML_API void ggml_gallocr_reserve_n_size(
+ ggml_gallocr_t galloc,
+ struct ggml_cgraph * graph,
+ const int * node_buffer_ids,
+ const int * leaf_buffer_ids,
+ size_t * sizes);
  GGML_API bool ggml_gallocr_reserve_n(
  ggml_gallocr_t galloc,
  struct ggml_cgraph * graph,
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i

  // Utils
  // Create a buffer and allocate all the tensors in a ggml_context
+ // ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+ GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
  GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
  GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

@@ -307,6 +307,7 @@ extern "C" {
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

  // Initialize backend buffers from a measure graph
+ GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
  GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

  GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
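The size-query additions above (the gallocr, context-tensor, and scheduler variants) all report how much memory the corresponding reserve/alloc call would need without actually allocating. A minimal sketch, illustrative only; it assumes the scheduler variant writes one entry per backend into `sizes`:

```cpp
#include <cstdio>
#include <vector>

#include "ggml-alloc.h"
#include "ggml-backend.h"

// Illustrative only: dry-run the allocations before committing to them.
void report_memory_needs(struct ggml_context * ctx, ggml_backend_buffer_type_t buft,
                         ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    // size of the buffer that ggml_backend_alloc_ctx_tensors_from_buft would create
    const size_t ctx_size = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
    printf("ctx tensors: %zu bytes\n", ctx_size);

    // per-backend compute buffer sizes that ggml_backend_sched_reserve would allocate
    std::vector<size_t> sizes(ggml_backend_sched_get_n_backends(sched), 0);
    ggml_backend_sched_reserve_size(sched, measure_graph, sizes.data());
    for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
        printf("backend %d compute buffer: %zu bytes\n", i, sizes[i]);
    }
}
```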
@@ -99,6 +99,7 @@ extern "C" {
  GGML_BACKEND_API int ggml_cpu_has_sme (void);
  // other
  GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
+ GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes
  GGML_BACKEND_API int ggml_cpu_has_vsx (void);
  GGML_BACKEND_API int ggml_cpu_has_vxe (void);
  GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
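`ggml_cpu_get_rvv_vlen` complements the boolean `ggml_cpu_has_riscv_v` with the vector register width in bytes; later hunks in this diff use it to gate the Q4_0 8x8 repack path. A trivial, illustrative sketch of branching on it (the 32-byte threshold mirrors `QK4_0` but is written here as a plain constant):

```cpp
#include "ggml-cpu.h"

// Illustrative only: pick a kernel variant based on the RISC-V vector length.
bool prefer_wide_rvv_path(void) {
    // ggml_cpu_get_rvv_vlen() returns 0 on non-RISC-V builds or without RVV intrinsics
    return ggml_cpu_has_riscv_v() && ggml_cpu_get_rvv_vlen() >= 32;
}
```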
@@ -2305,13 +2305,11 @@ extern "C" {
  float stop,
  float step);

- #define GGML_KQ_MASK_PAD 1
-
- // q: [n_embd_k, n_batch, n_head, ne3 ]
- // k: [n_embd_k, n_kv, n_head_kv, ne3 ]
- // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
- // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
- // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
+ // q: [n_embd_k, n_batch, n_head, ne3 ]
+ // k: [n_embd_k, n_kv, n_head_kv, ne3 ]
+ // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
+ // mask: [n_kv, n_batch, ne32, ne33]
+ // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
  //
  // broadcast:
  // n_head % n_head_kv == 0
@@ -2617,7 +2615,8 @@ extern "C" {

  // Set callback for all future logging events.
  // If this is not called, or NULL is supplied, everything is output on stderr.
- GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+ GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
+ GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);

  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);

@@ -386,6 +386,9 @@ if (GGML_CPU_ALL_VARIANTS)
  ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
  ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
  ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+ ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
+ ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
+ ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
  elseif (APPLE)
  ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
  ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
@@ -24,6 +24,7 @@

  #define UNUSED GGML_UNUSED

+ #if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
  static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
  int16x8_t * out_mins,
  int8_t * out_scales) {
@@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
  scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
  memcpy(out_scales, scales_u32, 8);
  }
+ #endif

  void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(QK8_0 == 32);
@@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
  } ggml_arm_arch_features = { 0 };
  #endif

+ #if defined(__riscv)
+ struct ggml_riscv_arch_features_type {
+ int rvv_vlen;
+ } ggml_riscv_arch_features = { 0 };
+ #endif

  #if defined(_WIN32)

@@ -187,6 +192,9 @@ typedef void * thread_ret_t;

  typedef pthread_t ggml_thread_t;

+ #define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
+ #define GGML_THREADPOOL_N_THREADS_BITS (16)
+
  #if defined(__APPLE__)
  #include <unistd.h>
  #include <mach/mach.h>
@@ -449,7 +457,7 @@ struct ggml_threadpool {
  struct ggml_cplan * cplan;

  // synchronization primitives
- atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
+ atomic_int n_graph; // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
  atomic_int GGML_CACHE_ALIGN n_barrier;
  atomic_int GGML_CACHE_ALIGN n_barrier_passed;
  atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -457,12 +465,10 @@ struct ggml_threadpool {
  // these are atomic as an annotation for thread-sanitizer
  atomic_bool stop; // Used for stopping the threadpool altogether
  atomic_bool pause; // Used for pausing the threadpool or individual threads
- atomic_int abort; // Used for aborting processing of a graph
+ atomic_int abort; // Used for aborting processing of a graph

  struct ggml_compute_state * workers; // per thread state
- int n_threads_max; // number of threads in the pool
- atomic_int n_threads_cur; // number of threads used in the current graph
-
+ int n_threads; // Number of threads in the pool
  int32_t prio; // Scheduling priority
  uint32_t poll; // Polling level (0 - no polling)

@@ -539,7 +545,7 @@ struct ggml_state {
  static struct ggml_state g_state = {0};

  void ggml_barrier(struct ggml_threadpool * tp) {
- int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+ int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
  if (n_threads == 1) {
  return;
  }
@@ -556,7 +562,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
  // last thread
  atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);

- // exit barrier (fill seq-cst fence)
+ // exit barrier (full seq-cst fence)
  atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
  return;
  }
@@ -702,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
  #endif
  #endif // __ARM_ARCH

+ #if defined(__riscv) && defined(__riscv_v_intrinsic)
+ #include <riscv_vector.h>
+ static void ggml_init_riscv_arch_features(void) {
+ ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+ }
+ #else
+ static void ggml_init_riscv_arch_features(void) {}
+ #endif
+
  struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
  GGML_ASSERT(!ggml_get_no_alloc(ctx));

@@ -2628,7 +2643,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
  void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
  if (!threadpool) return;

- const int n_threads = threadpool->n_threads_max;
+ const int n_threads = threadpool->n_threads;

  #ifndef GGML_USE_OPENMP
  struct ggml_compute_state* workers = threadpool->workers;
@@ -2704,7 +2719,7 @@ struct ggml_cplan ggml_graph_plan(
  //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
  }
  if (n_threads <= 0) {
- n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+ n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
  }

  #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@@ -2912,12 +2927,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

  struct ggml_compute_params params = {
  /*.ith =*/ state->ith,
- /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+ /*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
  /*.wsize =*/ cplan->work_size,
  /*.wdata =*/ cplan->work_data,
  /*.threadpool=*/ tp,
  };

+ GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
  for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
  struct ggml_tensor * node = cgraph->nodes[node_n];

@@ -2939,6 +2956,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  }
  }

+ GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
  ggml_barrier(state->threadpool);

  return 0;
@@ -2946,27 +2965,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

  #ifndef GGML_USE_OPENMP

- // check if thread is active
- static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
- struct ggml_threadpool * threadpool = state->threadpool;
- int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
- return (state->ith < n_threads);
- }
-
  // check if thread is ready to proceed (exit from polling or sleeping)
+ // returns true if loops should exit, sets state->pending to indicate new work
  static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
  struct ggml_threadpool * threadpool = state->threadpool;

  if (state->pending || threadpool->stop || threadpool->pause) { return true; }

  // check for new graph/work
- int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
- if (new_graph != state->last_graph) {
- state->pending = ggml_graph_compute_thread_active(state);
- state->last_graph = new_graph;
+ int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+ int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
+ if (n_graph != state->last_graph) {
+ state->pending = (state->ith < n_threads);
+ state->last_graph = n_graph;
+ return true;
  }

- return state->pending;
+ return false;
  }

  // sync thread state after polling
@@ -2983,11 +2998,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
  static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
  struct ggml_threadpool * threadpool = state->threadpool;

- // Skip polling for unused threads
- if (!ggml_graph_compute_thread_active(state)) {
- return state->pending;
- }
-
  // This seems to make 0 ... 100 a decent range for polling level across modern processors.
  // Perhaps, we can adjust it dynamically based on load and things.
  const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@@ -3049,7 +3059,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
  ggml_graph_compute_check_for_work(state);
  if (state->pending) {
  state->pending = false;
-
  ggml_graph_compute_thread(state);
  }
  }
@@ -3064,14 +3073,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int

  ggml_mutex_lock(&threadpool->mutex);

- GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+ // Update the number of active threads and the graph count
+ int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
+ n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);

- // Update the number of active threads
- atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+ GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);

  // Indicate the graph is ready to be processed
  // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
- atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+ atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);

  if (threadpool->pause) {
  // Update main thread prio and affinity to match the threadpool settings
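The kickoff hunk above packs two values into the single atomic `n_graph`: the active thread count in the low `GGML_THREADPOOL_N_THREADS_BITS` bits and an incrementing graph counter in the high bits, replacing the separate `n_threads_cur` atomic. A worked sketch of that encoding, illustrative only:

```cpp
// Illustrative only: how the packed value is built and read back, assuming
// GGML_THREADPOOL_N_THREADS_BITS == 16 and GGML_THREADPOOL_N_THREADS_MASK == 0xffff.
static int pack_n_graph(int graph_id, int n_threads) {
    return (graph_id << 16) | (n_threads & 0xffff);
}

static void unpack_n_graph(int packed, int * graph_id, int * n_threads) {
    *n_threads = packed & 0xffff; // what ggml_barrier() and the compute params read
    *graph_id  = packed >> 16;    // what workers compare against state->last_graph
}

// e.g. graph #3 dispatched to 8 threads: pack_n_graph(3, 8) == 0x00030008
```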
@@ -3109,8 +3119,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
  threadpool->pause = tpp->paused;
  threadpool->abort = -1;
  threadpool->workers = NULL;
- threadpool->n_threads_max = tpp->n_threads;
- threadpool->n_threads_cur = tpp->n_threads;
+ threadpool->n_threads = tpp->n_threads;
  threadpool->poll = tpp->poll;
  threadpool->prio = tpp->prio;
  threadpool->ec = GGML_STATUS_SUCCESS;
@@ -3205,7 +3214,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  {
  // update the number of threads from the actual number of threads that we got from OpenMP
  n_threads = omp_get_num_threads();
- atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+ atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
  }

  // Apply thread CPU mask and priority
@@ -3218,13 +3227,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  ggml_graph_compute_thread(&threadpool->workers[ith]);
  }
  } else {
- atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+ atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
  ggml_graph_compute_thread(&threadpool->workers[0]);
  }
  #else
- if (n_threads > threadpool->n_threads_max) {
- GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
- n_threads = threadpool->n_threads_max;
+ if (n_threads > threadpool->n_threads) {
+ GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+ n_threads = threadpool->n_threads;
  }

  // Kick all threads to start the new graph
@@ -3464,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
  #endif
  }

+ int ggml_cpu_get_rvv_vlen(void) {
+ #if defined(__riscv) && defined(__riscv_v_intrinsic)
+ return ggml_riscv_arch_features.rvv_vlen;
+ #else
+ return 0;
+ #endif
+ }
+
  int ggml_cpu_has_f16c(void) {
  #if defined(__F16C__)
  return 1;
@@ -3630,6 +3647,10 @@ void ggml_cpu_init(void) {
  ggml_init_arm_arch_features();
  #endif

+ #if defined(__riscv)
+ ggml_init_riscv_arch_features();
+ #endif
+
  is_first_call = false;
  }

@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
  if (ggml_cpu_has_riscv_v()) {
  features.push_back({ "RISCV_V", "1" });
  }
+ if (ggml_cpu_get_rvv_vlen() > 0) {
+ static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+ features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+ }
  if (ggml_cpu_has_vsx()) {
  features.push_back({ "VSX", "1" });
  }
@@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
  static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

  if (cur->type == GGML_TYPE_Q4_0) {
- if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+ if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+ || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
  if (cur->ne[1] % 8 == 0) {
  return &q4_0_8x8_q8_0;
  }
@@ -313,6 +313,7 @@ extern "C" {
  bool check_tensors; // validate model tensor data
  bool use_extra_bufts; // use extra buffer types (used for weight repacking)
  bool no_host; // bypass host buffer allowing extra buffers to be used
+ bool no_alloc; // only load metadata and simulate memory allocations
  };

  // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
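`no_alloc` is a new flag on `llama_model_params`; per its comment it loads only metadata and simulates memory allocations. A hedged sketch of requesting such a dry-run load; that the regular load entry point is the intended consumer of the flag is an assumption, not something the diff shows:

```cpp
#include "llama.h"

// Illustrative only: metadata-only load with simulated allocations (assumed usage).
llama_model * dry_run_load(const char * path_model) {
    llama_model_params mparams = llama_model_default_params();
    mparams.no_alloc = true; // only load metadata and simulate memory allocations
    return llama_model_load_from_file(path_model, mparams);
}
```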
@@ -466,10 +467,24 @@ extern "C" {
  // Frees all allocated memory
  LLAMA_API void llama_free(struct llama_context * ctx);

+ // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+ // returns true if the parameters could be successfully modified to fit device memory
+ // this function is NOT thread safe because it modifies the global llama logger state
+ LLAMA_API bool llama_params_fit(
+ const char * path_model,
+ struct llama_model_params * mparams,
+ struct llama_context_params * cparams,
+ float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
+ struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+ size_t margin, // margin of memory to leave per device in bytes
+ uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
+ enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
+
  LLAMA_API int64_t llama_time_us(void);

  LLAMA_API size_t llama_max_devices(void);
  LLAMA_API size_t llama_max_parallel_sequences(void);
+ LLAMA_API size_t llama_max_tensor_buft_overrides(void);

  LLAMA_API bool llama_supports_mmap (void);
  LLAMA_API bool llama_supports_mlock (void);
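A hedged sketch of calling the new fitting entry point, pieced together only from the declarations and comments above. The margin and minimum context values are arbitrary, and feeding the fitted split/overrides back in through `mparams` is an assumption rather than something the diff shows:

```cpp
#include <vector>

#include "llama.h"

// Illustrative only.
bool load_fitted(const char * path_model, llama_model ** out_model) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

    const size_t   margin    = 512ull * 1024 * 1024; // leave ~512 MiB free per device (arbitrary)
    const uint32_t n_ctx_min = 4096;                 // do not shrink the context below this (arbitrary)

    if (!llama_params_fit(path_model, &mparams, &cparams, tensor_split.data(), overrides.data(),
                          margin, n_ctx_min, GGML_LOG_LEVEL_INFO)) {
        return false; // could not fit the model into device memory
    }

    // assumption: the fitted split/overrides are passed back in via the model params
    mparams.tensor_split          = tensor_split.data();
    mparams.tensor_buft_overrides = overrides.data();

    *out_model = llama_model_load_from_file(path_model, mparams);
    return *out_model != nullptr;
}
```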
@@ -1354,7 +1369,9 @@ extern "C" {

  // Set callback for all future logging events.
  // If this is not called, or NULL is supplied, everything is output on stderr.
- LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
+ // The logger state is global so these functions are NOT thread safe.
+ LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+ LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);

  //
  // Performance utils
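`llama_log_get` pairs with the long-standing `llama_log_set` and makes save/restore of the global (not thread-safe) logger state possible, e.g. around a call such as `llama_params_fit` that adjusts logging internally. Illustrative sketch only:

```cpp
#include "llama.h"

// Illustrative only: save the current logger, swap in a quiet one, restore afterwards.
static void quiet_logger(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level; (void) text; (void) user_data; // drop everything
}

void run_quietly(void (*noisy_op)(void)) {
    ggml_log_callback prev_cb = nullptr;
    void *            prev_ud = nullptr;
    llama_log_get(&prev_cb, &prev_ud); // new in this version

    llama_log_set(quiet_logger, nullptr);
    noisy_op();
    llama_log_set(prev_cb, prev_ud);   // restore the previous callback
}
```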