@fugood/llama.node 1.4.6 → 1.4.8
This diff shows the publicly available contents of the two package versions as published to their registry. It is provided for informational purposes only.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -26
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +3 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +272 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +680 -47
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +99 -29
- package/src/llama.cpp/src/llama-context.h +9 -3
- package/src/llama.cpp/src/llama-grammar.cpp +233 -33
- package/src/llama.cpp/src/llama-grammar.h +20 -1
- package/src/llama.cpp/src/llama-graph.cpp +85 -17
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +123 -52
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +8 -7
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

```diff
@@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = { 0 };
 #endif
 
+#if defined(__riscv)
+struct ggml_riscv_arch_features_type {
+    int rvv_vlen;
+} ggml_riscv_arch_features = { 0 };
+#endif
 
 #if defined(_WIN32)
 
@@ -187,6 +192,9 @@ typedef void * thread_ret_t;
 
 typedef pthread_t ggml_thread_t;
 
+#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
+#define GGML_THREADPOOL_N_THREADS_BITS (16)
+
 #if defined(__APPLE__)
 #include <unistd.h>
 #include <mach/mach.h>
```
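The two new masks let the active thread count ride in the low 16 bits of the threadpool's `n_graph` atomic while the graph generation occupies the upper bits, so a worker learns about a new graph and its thread count from a single load. A standalone sketch of the encode/decode arithmetic (variable names here are ours, not from the package):

```cpp
#include <cstdio>

#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
#define GGML_THREADPOOL_N_THREADS_BITS (16)

int main() {
    int packed = 0; // mirrors threadpool->n_graph

    // kickoff side: bump the generation, put the active thread count in the low bits
    const int n_threads  = 8;
    const int generation = (packed >> GGML_THREADPOOL_N_THREADS_BITS) + 1;
    packed = (generation << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);

    // worker side: one load (atomic in the real code) recovers both fields
    std::printf("generation=%d n_threads=%d\n",
                packed >> GGML_THREADPOOL_N_THREADS_BITS,
                packed &  GGML_THREADPOOL_N_THREADS_MASK);
    return 0; // prints: generation=1 n_threads=8
}
```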
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

```diff
@@ -449,7 +457,7 @@ struct ggml_threadpool {
     struct ggml_cplan * cplan;
 
     // synchronization primitives
-    atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
+    atomic_int n_graph;       // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -457,12 +465,10 @@ struct ggml_threadpool {
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop;         // Used for stopping the threadpool altogether
     atomic_bool pause;        // Used for pausing the threadpool or individual threads
-    atomic_int  abort;        // Used for aborting processing of a graph
+    atomic_int abort;         // Used for aborting processing of a graph
 
     struct ggml_compute_state * workers;   // per thread state
-    int          n_threads_max; // number of threads in the pool
-    atomic_int   n_threads_cur; // number of threads used in the current graph
-
+    int      n_threads;       // Number of threads in the pool
     int32_t  prio;            // Scheduling priority
     uint32_t poll;            // Polling level (0 - no polling)
 
@@ -490,6 +496,15 @@ static inline void ggml_thread_cpu_relax(void) {
 static inline void ggml_thread_cpu_relax(void) {
     _mm_pause();
 }
+#elif defined(__riscv)
+static inline void ggml_thread_cpu_relax(void) {
+#ifdef __riscv_zihintpause
+    __asm__ __volatile__ ("pause");
+#else
+    /* Encoding of the pause instruction */
+    __asm__ __volatile__ (".4byte 0x100000F");
+#endif
+}
 #else
 static inline void ggml_thread_cpu_relax(void) {;}
 #endif
@@ -530,7 +545,7 @@ struct ggml_state {
 static struct ggml_state g_state = {0};
 
 void ggml_barrier(struct ggml_threadpool * tp) {
-    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+    int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
     if (n_threads == 1) {
         return;
     }
@@ -547,7 +562,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
 
-        // exit barrier (fill seq-cst fence)
+        // exit barrier (full seq-cst fence)
         atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
         return;
     }
@@ -693,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
 #endif
 #endif // __ARM_ARCH
 
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+static void ggml_init_riscv_arch_features(void) {
+    ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+}
+#else
+static void ggml_init_riscv_arch_features(void) {}
+#endif
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));
 
@@ -2619,7 +2643,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
-    const int n_threads = threadpool->n_threads_max;
+    const int n_threads = threadpool->n_threads;
 
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
@@ -2695,7 +2719,7 @@ struct ggml_cplan ggml_graph_plan(
         //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
     }
     if (n_threads <= 0) {
-        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+        n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
     }
 
 #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@@ -2903,12 +2927,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     struct ggml_compute_params params = {
         /*.ith       =*/ state->ith,
-        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+        /*.nth       =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
         /*.wsize     =*/ cplan->work_size,
         /*.wdata     =*/ cplan->work_data,
         /*.threadpool=*/ tp,
     };
 
+    GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
@@ -2930,6 +2956,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
     }
 
+    GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     ggml_barrier(state->threadpool);
 
     return 0;
@@ -2937,27 +2965,23 @@
 
 #ifndef GGML_USE_OPENMP
 
-// check if thread is active
-static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    return (state->ith < n_threads);
-}
-
 // check if thread is ready to proceed (exit from polling or sleeping)
+// returns true if loops should exit, sets state->pending to indicate new work
 static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
     if (state->pending || threadpool->stop || threadpool->pause) { return true; }
 
     // check for new graph/work
-    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
-    if (new_graph != state->last_graph) {
-        state->pending    = ggml_graph_compute_thread_active(state);
-        state->last_graph = new_graph;
+    int n_graph   = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
+    if (n_graph != state->last_graph) {
+        state->pending    = (state->ith < n_threads);
+        state->last_graph = n_graph;
+        return true;
     }
 
-    return state->pending;
+    return false;
 }
 
 // sync thread state after polling
@@ -2974,11 +2998,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
-    // Skip polling for unused threads
-    if (!ggml_graph_compute_thread_active(state)) {
-        return state->pending;
-    }
-
     // This seems to make 0 ... 100 a decent range for polling level across modern processors.
     // Perhaps, we can adjust it dynamically based on load and things.
     const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@@ -3040,7 +3059,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
         ggml_graph_compute_check_for_work(state);
         if (state->pending) {
             state->pending = false;
-
             ggml_graph_compute_thread(state);
         }
     }
@@ -3055,14 +3073,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
 
     ggml_mutex_lock(&threadpool->mutex);
 
-    GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+    // Update the number of active threads and the graph count
+    int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
+    n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);
 
-    // Update the number of active threads
-    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+    GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);
 
     // Indicate the graph is ready to be processed
    // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
-    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+    atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);
 
     if (threadpool->pause) {
         // Update main thread prio and affinity to match the threadpool settings
```
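Worked through once with illustrative values, the kickoff update amounts to:

```cpp
// old n_graph = (3 << 16) | 4                  -> generation 3, 4 active threads
// generation  = 0x30004 >> 16                  -> 3
// new n_graph = ((3 + 1) << 16) | (8 & 0xffff) -> 0x40008 (generation 4, 8 threads)
//
// A worker whose state->last_graph is still 0x30004 sees 0x40008 != last_graph,
// sets pending if its ith < 8, and records last_graph = 0x40008; the thread
// count now arrives in the same load as the "new graph" signal.
```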
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

```diff
@@ -3100,8 +3119,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->pause             = tpp->paused;
     threadpool->abort             = -1;
     threadpool->workers           = NULL;
-    threadpool->n_threads_max     = tpp->n_threads;
-    threadpool->n_threads_cur     = tpp->n_threads;
+    threadpool->n_threads         = tpp->n_threads;
     threadpool->poll              = tpp->poll;
     threadpool->prio              = tpp->prio;
     threadpool->ec                = GGML_STATUS_SUCCESS;
@@ -3196,7 +3214,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
             {
                 // update the number of threads from the actual number of threads that we got from OpenMP
                 n_threads = omp_get_num_threads();
-                atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+                atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
             }
 
     // Apply thread CPU mask and priority
@@ -3209,13 +3227,13 @@
             ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
-        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+        atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
         ggml_graph_compute_thread(&threadpool->workers[0]);
     }
 #else
-    if (n_threads > threadpool->n_threads_max) {
-        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
-        n_threads = threadpool->n_threads_max;
+    if (n_threads > threadpool->n_threads) {
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+        n_threads = threadpool->n_threads;
     }
 
     // Kick all threads to start the new graph
@@ -3455,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
 #endif
 }
 
+int ggml_cpu_get_rvv_vlen(void) {
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+    return ggml_riscv_arch_features.rvv_vlen;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
     return 1;
@@ -3621,6 +3647,10 @@ void ggml_cpu_init(void) {
         ggml_init_arm_arch_features();
 #endif
 
+#if defined(__riscv)
+        ggml_init_riscv_arch_features();
+#endif
+
         is_first_call = false;
     }
 
```
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp

```diff
@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_riscv_v()) {
             features.push_back({ "RISCV_V", "1" });
         }
+        if (ggml_cpu_get_rvv_vlen() > 0) {
+            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+        }
         if (ggml_cpu_has_vsx()) {
             features.push_back({ "VSX", "1" });
         }
```
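On the consumer side, the vector length surfaces as a string-valued CPU backend feature next to `RISCV_V`. A minimal reader sketch using the public backend registry API (assuming, as in other ggml feature loops, that the returned array is terminated by a null `name`):

```cpp
#include <cstdio>
#include <cstring>

#include "ggml-backend.h"
#include "ggml-cpu.h"

int main() {
    ggml_backend_reg_t reg = ggml_backend_cpu_reg();
    auto get_features = (ggml_backend_get_features_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
    if (!get_features) {
        return 1;
    }
    // scan the NULL-name-terminated feature list for the new entry
    for (ggml_backend_feature * f = get_features(reg); f->name; f++) {
        if (std::strcmp(f->name, "RVV_VLEN") == 0) {
            std::printf("RVV vector length: %s bytes\n", f->value);
        }
    }
    return 0;
}
```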
package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp

```diff
@@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
 
     if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
             if (cur->ne[1] % 8 == 0) {
                 return &q4_0_8x8_q8_0;
             }
```
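The arithmetic behind the new guard: `ggml_cpu_get_rvv_vlen()` returns `__riscv_vlenb()`, i.e. the vector length in bytes, and QK4_0 is 32 in ggml, so `ggml_cpu_get_rvv_vlen() >= QK4_0` admits the Q4_0 8x8 repacking path only on RVV hardware with a VLEN of at least 32 × 8 = 256 bits.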
package/src/llama.cpp/include/llama.h

```diff
@@ -313,6 +313,7 @@ extern "C" {
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host;         // bypass host buffer allowing extra buffers to be used
+        bool no_alloc;        // only load metadata and simulate memory allocations
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
```
package/src/llama.cpp/include/llama.h

```diff
@@ -466,10 +467,24 @@
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+    // returns true if the parameters could be successfully modified to fit device memory
+    // this function is NOT thread safe because it modifies the global llama logger state
+    LLAMA_API bool llama_params_fit(
+            const char                              * path_model,
+            struct llama_model_params               * mparams,
+            struct llama_context_params             * cparams,
+            float                                   * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
+            struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+            size_t                                    margin,                // margin of memory to leave per device in bytes
+            uint32_t                                  n_ctx_min,             // minimum context size to set when trying to reduce memory use
+            enum ggml_log_level                       log_level);            // minimum log level to print during fitting, lower levels go to debug log
+
     LLAMA_API int64_t llama_time_us(void);
 
     LLAMA_API size_t llama_max_devices(void);
     LLAMA_API size_t llama_max_parallel_sequences(void);
+    LLAMA_API size_t llama_max_tensor_buft_overrides(void);
 
     LLAMA_API bool llama_supports_mmap (void);
     LLAMA_API bool llama_supports_mlock (void);
```
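`llama_params_fit` takes caller-owned scratch buffers sized by the two `llama_max_*` queries above. A minimal usage sketch under those documented constraints; the helper name and the margin/context values are illustrative, not from the package:

```cpp
#include <vector>

#include "llama.h"

// Illustrative helper: ask llama_params_fit to shrink mparams/cparams until the
// model fits free device memory, leaving ~1 GiB per device untouched.
static bool try_fit(const char * path_model,
                    llama_model_params   & mparams,
                    llama_context_params & cparams) {
    // writable scratch buffers, sized per the header comments
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

    // NOT thread safe: llama_params_fit modifies the global llama logger state.
    return llama_params_fit(path_model, &mparams, &cparams,
                            tensor_split.data(), overrides.data(),
                            /*margin   =*/ 1024ull * 1024 * 1024,
                            /*n_ctx_min=*/ 4096,
                            /*log_level=*/ GGML_LOG_LEVEL_INFO);
}
```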
package/src/llama.cpp/include/llama.h

```diff
@@ -1354,7 +1369,9 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
+    // The logger state is global so these functions are NOT thread safe.
+    LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+    LLAMA_API void llama_log_set(ggml_log_callback   log_callback, void *  user_data);
 
     //
     // Performance utils
```
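The new `llama_log_get` pairs with the existing `llama_log_set` so a caller can snapshot and restore the global logger around code that swaps it, such as `llama_params_fit` above. A sketch of an RAII guard built on the two calls; the guard type is our own illustration, not part of llama.h:

```cpp
#include "llama.h"

// Save the global logger on construction, restore it on scope exit.
struct llama_logger_guard {
    ggml_log_callback cb   = nullptr;
    void *            data = nullptr;
    llama_logger_guard()  { llama_log_get(&cb, &data); }
    ~llama_logger_guard() { llama_log_set(cb, data); }
};
```

Instantiating the guard before a call that reconfigures logging keeps the caller's logging setup intact afterwards.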
package/src/llama.cpp/src/CMakeLists.txt

```diff
@@ -67,7 +67,7 @@ add_library(llama
             models/gemma-embedding.cpp
             models/gemma.cpp
             models/gemma2-iswa.cpp
-            models/gemma3-iswa.cpp
+            models/gemma3.cpp
             models/gemma3n-iswa.cpp
             models/glm4-moe.cpp
             models/glm4.cpp
@@ -139,6 +139,7 @@ add_library(llama
 set_target_properties(llama PROPERTIES
     VERSION ${LLAMA_INSTALL_VERSION}
     SOVERSION 0
+    MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
 )
 
 target_include_directories(llama PRIVATE .)
```