@fugood/llama.node 1.4.7 → 1.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +22 -23
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +140 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +98 -18
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +60 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +93 -23
- package/src/llama.cpp/src/llama-context.h +8 -2
- package/src/llama.cpp/src/llama-graph.cpp +84 -16
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +103 -44
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +5 -5
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/common/sampling.h
@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
 
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
+
 // extended sampling implementation:
 //
 // - set logits
@@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
 
 // generalized version of common_sampler_sample
 //
@@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
 
 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
 
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 
@@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:
 
 llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
                                        const char * grammar_kind, const char * grammar_data);
+
+struct common_sampler_deleter {
+    void operator()(common_sampler * s) { common_sampler_free(s); }
+};
+
+typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
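Note: the new `common_sampler_ptr` alias pairs `common_sampler_free` with `std::unique_ptr`, and `common_sampler_sample` no longer takes a `grammar_first` flag. A minimal usage sketch (not part of the package; it assumes `common_sampler_init()` and a loaded model/context, as in upstream llama.cpp's common library):

```cpp
#include "sampling.h" // common/sampling.h from the bundled llama.cpp

// Hypothetical helper: sample one token with RAII-managed sampler lifetime.
llama_token sample_one(const llama_model * model, llama_context * ctx,
                       const common_params_sampling & sparams) {
    // common_sampler_ptr calls common_sampler_free() automatically on scope exit
    common_sampler_ptr smpl(common_sampler_init(model, sparams));
    // grammar handling is now internal; the old grammar_first argument is gone
    return common_sampler_sample(smpl.get(), ctx, /*idx =*/ 0);
}
```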
package/src/llama.cpp/common/speculative.cpp
@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
     for (int i = 0; i < params.n_draft; ++i) {
         common_batch_clear(batch);
 
-        common_sampler_sample(smpl, ctx_dft, 0, true);
+        common_sampler_sample(smpl, ctx_dft, 0);
 
         const auto * cur_p = common_sampler_get_candidates(smpl, true);
 
package/src/llama.cpp/ggml/CMakeLists.txt
@@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     # TODO
 else()
     set(GGML_STANDALONE OFF)
+
+    if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+        set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+    endif()
 endif()
 
 if (EMSCRIPTEN)
package/src/llama.cpp/ggml/include/ggml-alloc.h
@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// ggml_gallocr_resrve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
 GGML_API bool ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
 
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
 
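Note: the new `*_size` variants let callers measure how much memory a graph would need before committing to an allocation. A hedged sketch (not from the package) of querying a worst-case size with `ggml_gallocr_reserve_n_size`; passing NULL buffer ids is an assumption that it follows the same single-buffer convention as `ggml_gallocr_reserve`, and the header location of `ggml_backend_cpu_buffer_type()` is taken from upstream ggml:

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h" // assumed home of ggml_backend_cpu_buffer_type() in this tree

// Hypothetical helper: report the size a single CPU buffer would need for `graph`
// without actually reserving it.
size_t measure_graph_buffer_size(struct ggml_cgraph * graph) {
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    size_t size = 0; // one entry per galloc buffer; a single buffer here
    ggml_gallocr_reserve_n_size(galloc, graph,
                                /*node_buffer_ids =*/ nullptr,
                                /*leaf_buffer_ids =*/ nullptr,
                                &size);
    ggml_gallocr_free(galloc);
    return size;
}
```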
package/src/llama.cpp/ggml/include/ggml-backend.h
@@ -307,6 +307,7 @@ extern "C" {
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
 
     GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
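Note: `ggml_backend_sched_reserve_size` plays the same role for a scheduler, reporting what `ggml_backend_sched_reserve` would allocate. The header excerpt above does not state the required length of `sizes`; the sketch below assumes one entry per scheduler backend (explanatory code, not shipped with the package):

```cpp
#include <vector>
#include "ggml-backend.h"

// Hypothetical helper: collect the per-backend buffer sizes that reserving
// `measure_graph` would require on an already-created scheduler.
std::vector<size_t> measure_sched_sizes(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    std::vector<size_t> sizes(ggml_backend_sched_get_n_backends(sched), 0); // assumption: one size per backend
    ggml_backend_sched_reserve_size(sched, measure_graph, sizes.data());
    return sizes;
}
```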
package/src/llama.cpp/ggml/include/ggml-cpu.h
@@ -99,6 +99,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_sme       (void);
     // other
     GGML_BACKEND_API int ggml_cpu_has_riscv_v   (void);
+    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen  (void); // risc-v vector length in bytes
     GGML_BACKEND_API int ggml_cpu_has_vsx       (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe       (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
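Note: a quick runtime check of the new RISC-V query (a sketch, not shipped code; per the ggml-cpu.c changes below, the value is populated when the CPU backend initializes and is 0 on non-RVV builds):

```cpp
#include <cstdio>
#include "ggml-cpu.h"

int main() {
    if (ggml_cpu_has_riscv_v()) {
        // vector register width in bytes, as reported by the new API
        std::printf("RVV available, VLEN = %d bytes\n", ggml_cpu_get_rvv_vlen());
    } else {
        std::printf("RVV not available on this CPU/build\n");
    }
    return 0;
}
```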
package/src/llama.cpp/ggml/include/ggml.h
@@ -2305,13 +2305,11 @@ extern "C" {
             float stop,
             float step);
 
-
-
-    //
-    //
-    //
-    // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
+    // q:    [n_embd_k, n_batch, n_head, ne3 ]
+    // k:    [n_embd_k, n_kv, n_head_kv, ne3 ]
+    // v:    [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv, n_batch, ne32, ne33]
+    // res:  [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
     //
     // broadcast:
     //   n_head % n_head_kv == 0
@@ -2617,7 +2615,8 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+    GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
+    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
 
package/src/llama.cpp/ggml/src/CMakeLists.txt
@@ -386,6 +386,9 @@ if (GGML_CPU_ALL_VARIANTS)
         ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
         ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
         ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+        ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
+        ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
+        ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
     elseif (APPLE)
         ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
         ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -24,6 +24,7 @@
 
 #define UNUSED GGML_UNUSED
 
+#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
 static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
                                              int16x8_t * out_mins,
                                              int8_t * out_scales) {
@@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
     scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
     memcpy(out_scales, scales_u32, 8);
 }
+#endif
 
 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = { 0 };
 #endif
 
+#if defined(__riscv)
+struct ggml_riscv_arch_features_type {
+    int rvv_vlen;
+} ggml_riscv_arch_features = { 0 };
+#endif
 
 #if defined(_WIN32)
 
@@ -187,6 +192,9 @@ typedef void * thread_ret_t;
 
 typedef pthread_t ggml_thread_t;
 
+#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
+#define GGML_THREADPOOL_N_THREADS_BITS (16)
+
 #if defined(__APPLE__)
 #include <unistd.h>
 #include <mach/mach.h>
@@ -449,7 +457,7 @@ struct ggml_threadpool {
     struct ggml_cplan * cplan;
 
     // synchronization primitives
-    atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
+    atomic_int n_graph;       // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -457,12 +465,10 @@ struct ggml_threadpool {
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop;         // Used for stopping the threadpool altogether
     atomic_bool pause;        // Used for pausing the threadpool or individual threads
-    atomic_int
+    atomic_int  abort;        // Used for aborting processing of a graph
 
     struct ggml_compute_state * workers;   // per thread state
-    int          n_threads_max; // number of threads in the pool
-    atomic_int   n_threads_cur; // number of threads used in the current graph
-
+    int          n_threads;   // Number of threads in the pool
     int32_t      prio;        // Scheduling priority
     uint32_t     poll;        // Polling level (0 - no polling)
 
@@ -539,7 +545,7 @@ struct ggml_state {
 static struct ggml_state g_state = {0};
 
 void ggml_barrier(struct ggml_threadpool * tp) {
-    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+    int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
     if (n_threads == 1) {
         return;
     }
@@ -556,7 +562,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
 
-        // exit barrier (
+        // exit barrier (full seq-cst fence)
         atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
         return;
     }
@@ -702,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
 #endif
 #endif // __ARM_ARCH
 
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+static void ggml_init_riscv_arch_features(void) {
+    ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+}
+#else
+static void ggml_init_riscv_arch_features(void) {}
+#endif
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));
 
@@ -2628,7 +2643,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
-    const int n_threads = threadpool->n_threads_max;
+    const int n_threads = threadpool->n_threads;
 
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
@@ -2704,7 +2719,7 @@ struct ggml_cplan ggml_graph_plan(
         //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
     }
     if (n_threads <= 0) {
-        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+        n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
     }
 
 #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@@ -2912,12 +2927,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     struct ggml_compute_params params = {
         /*.ith =*/ state->ith,
-        /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+        /*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
         /*.wsize =*/ cplan->work_size,
         /*.wdata =*/ cplan->work_data,
         /*.threadpool=*/ tp,
     };
 
+    GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
@@ -2939,6 +2956,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
     }
 
+    GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     ggml_barrier(state->threadpool);
 
     return 0;
@@ -2946,27 +2965,23 @@
 
 #ifndef GGML_USE_OPENMP
 
-// check if thread is active
-static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    return (state->ith < n_threads);
-}
-
 // check if thread is ready to proceed (exit from polling or sleeping)
+// returns true if loops should exit, sets state->pending to indicate new work
 static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
     if (state->pending || threadpool->stop || threadpool->pause) { return true; }
 
     // check for new graph/work
-    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
-    if (new_graph != state->last_graph) {
-        state->pending    = ggml_graph_compute_thread_active(state);
-        state->last_graph = new_graph;
+    int n_graph   = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
+    if (n_graph != state->last_graph) {
+        state->pending    = (state->ith < n_threads);
+        state->last_graph = n_graph;
+        return true;
     }
 
-    return state->pending;
+    return false;
 }
 
 // sync thread state after polling
@@ -2983,11 +2998,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
-    // Skip polling for unused threads
-    if (!ggml_graph_compute_thread_active(state)) {
-        return state->pending;
-    }
-
     // This seems to make 0 ... 100 a decent range for polling level across modern processors.
     // Perhaps, we can adjust it dynamically based on load and things.
     const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@@ -3049,7 +3059,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
         ggml_graph_compute_check_for_work(state);
         if (state->pending) {
             state->pending = false;
-
             ggml_graph_compute_thread(state);
         }
     }
@@ -3064,14 +3073,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
 
     ggml_mutex_lock(&threadpool->mutex);
 
-
+    // Update the number of active threads and the graph count
+    int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
+    n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);
 
-    // Update the number of active threads
-    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+    GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);
 
     // Indicate the graph is ready to be processed
     // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
-    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+    atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);
 
     if (threadpool->pause) {
         // Update main thread prio and affinity to match the threadpool settings
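Note: the old pair of thread-count fields (`n_threads_max` plus the atomic `n_threads_cur`) is collapsed into the single atomic `n_graph`: the low 16 bits carry the active thread count and the upper bits carry a graph generation counter bumped on every kickoff. A small self-contained illustration of the packing arithmetic (the constants mirror `GGML_THREADPOOL_N_THREADS_MASK`/`_BITS` above; explanatory code, not part of the package):

```cpp
#include <cassert>
#include <cstdint>

int main() {
    constexpr uint32_t MASK = 0xffffu; // GGML_THREADPOOL_N_THREADS_MASK
    constexpr uint32_t BITS = 16;      // GGML_THREADPOOL_N_THREADS_BITS

    uint32_t n_graph = 0;

    // kickoff of a new graph with 8 active threads
    uint32_t gen = n_graph >> BITS;              // previous generation
    n_graph = ((gen + 1) << BITS) | (8u & MASK); // bump generation, store thread count

    assert((n_graph & MASK) == 8);  // what ggml_barrier() and params.nth read back
    assert((n_graph >> BITS) == 1); // generation; workers detect work when the packed value changes
    return 0;
}
```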
@@ -3109,8 +3119,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->pause         = tpp->paused;
     threadpool->abort         = -1;
     threadpool->workers       = NULL;
-    threadpool->n_threads_max = tpp->n_threads;
-    threadpool->n_threads_cur = tpp->n_threads;
+    threadpool->n_threads     = tpp->n_threads;
     threadpool->poll          = tpp->poll;
     threadpool->prio          = tpp->prio;
     threadpool->ec            = GGML_STATUS_SUCCESS;
@@ -3205,7 +3214,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         {
             // update the number of threads from the actual number of threads that we got from OpenMP
             n_threads = omp_get_num_threads();
-            atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+            atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
         }
 
         // Apply thread CPU mask and priority
@@ -3218,13 +3227,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
             ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
-        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+        atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
         ggml_graph_compute_thread(&threadpool->workers[0]);
     }
 #else
-    if (n_threads > threadpool->n_threads_max) {
-        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
-        n_threads = threadpool->n_threads_max;
+    if (n_threads > threadpool->n_threads) {
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+        n_threads = threadpool->n_threads;
     }
 
     // Kick all threads to start the new graph
@@ -3464,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
 #endif
 }
 
+int ggml_cpu_get_rvv_vlen(void) {
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+    return ggml_riscv_arch_features.rvv_vlen;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
     return 1;
@@ -3630,6 +3647,10 @@ void ggml_cpu_init(void) {
     ggml_init_arm_arch_features();
 #endif
 
+#if defined(__riscv)
+    ggml_init_riscv_arch_features();
+#endif
+
     is_first_call = false;
 }
 
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_riscv_v()) {
            features.push_back({ "RISCV_V", "1" });
         }
+        if (ggml_cpu_get_rvv_vlen() > 0) {
+            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+        }
         if (ggml_cpu_has_vsx()) {
             features.push_back({ "VSX", "1" });
         }
package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp
@@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
 
     if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
             if (cur->ne[1] % 8 == 0) {
                 return &q4_0_8x8_q8_0;
             }
package/src/llama.cpp/include/llama.h
@@ -313,6 +313,7 @@ extern "C" {
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
        bool no_host;         // bypass host buffer allowing extra buffers to be used
+        bool no_alloc;        // only load metadata and simulate memory allocations
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
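Note: the new `no_alloc` model flag is what enables dry-run loading. A hedged sketch (it assumes `llama_model_default_params` and `llama_model_load_from_file` behave as in upstream llama.cpp; not code from this package):

```cpp
#include "llama.h"

// Hypothetical helper: open a model for inspection only, without allocating
// its tensor buffers.
llama_model * load_metadata_only(const char * path_model) {
    llama_model_params mparams = llama_model_default_params();
    mparams.no_alloc = true; // only load metadata and simulate memory allocations
    return llama_model_load_from_file(path_model, mparams);
}
```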
@@ -466,10 +467,24 @@
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+    // returns true if the parameters could be successfully modified to fit device memory
+    // this function is NOT thread safe because it modifies the global llama logger state
+    LLAMA_API bool llama_params_fit(
+            const char * path_model,
+            struct llama_model_params * mparams,
+            struct llama_context_params * cparams,
+            float * tensor_split,                                            // writable buffer for tensor split, needs at least llama_max_devices elements
+            struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+            size_t margin,                                                   // margin of memory to leave per device in bytes
+            uint32_t n_ctx_min,                                              // minimum context size to set when trying to reduce memory use
+            enum ggml_log_level log_level);                                  // minimum log level to print during fitting, lower levels go to debug log
+
     LLAMA_API int64_t llama_time_us(void);
 
     LLAMA_API size_t llama_max_devices(void);
     LLAMA_API size_t llama_max_parallel_sequences(void);
+    LLAMA_API size_t llama_max_tensor_buft_overrides(void);
 
     LLAMA_API bool llama_supports_mmap  (void);
     LLAMA_API bool llama_supports_mlock (void);
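Note: a hedged sketch of calling the new `llama_params_fit` (not shipped with the package). Buffer sizes follow the comments in the header above; the margin and minimum context values are arbitrary example choices:

```cpp
#include <vector>
#include "llama.h"

// Hypothetical helper: shrink model/context parameters to fit free device memory
// before loading. Returns true if the parameters were adjusted successfully.
bool try_fit(const char * path_model,
             llama_model_params & mparams,
             llama_context_params & cparams) {
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

    return llama_params_fit(path_model, &mparams, &cparams,
                            tensor_split.data(), overrides.data(),
                            /*margin    =*/ 1024u * 1024u * 1024u, // leave ~1 GiB free per device
                            /*n_ctx_min =*/ 4096,
                            /*log_level =*/ GGML_LOG_LEVEL_INFO);
}
```

Note that, per the header comment, this call is not thread safe because it touches the global llama logger state.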
@@ -1354,7 +1369,9 @@
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
+    // The logger state is global so these functions are NOT thread safe.
+    LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
     //
     // Performance utils
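Note: `llama_log_get` makes it possible to save and restore the global logger around a noisy call. A sketch under the stated non-thread-safety caveat (explanatory code, not from the package; `null_logger` and `with_quiet_logs` are hypothetical names):

```cpp
#include "llama.h"

// Hypothetical no-op logger matching the ggml_log_callback signature.
static void null_logger(enum ggml_log_level, const char *, void *) {}

void with_quiet_logs(void (*fn)(void)) {
    ggml_log_callback prev_cb = nullptr;
    void * prev_ud = nullptr;
    llama_log_get(&prev_cb, &prev_ud);   // save the current global logger
    llama_log_set(null_logger, nullptr); // silence logging
    fn();
    llama_log_set(prev_cb, prev_ud);     // restore (NULL restores the stderr default)
}
```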