@fugood/llama.node 1.4.7 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +22 -23
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +40 -16
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +91 -92
  23. package/src/llama.cpp/common/sampling.h +11 -6
  24. package/src/llama.cpp/common/speculative.cpp +1 -1
  25. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  26. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  27. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  29. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  30. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +60 -39
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  35. package/src/llama.cpp/include/llama.h +18 -1
  36. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  37. package/src/llama.cpp/src/llama-arch.h +9 -2
  38. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  39. package/src/llama.cpp/src/llama-batch.h +4 -2
  40. package/src/llama.cpp/src/llama-context.cpp +93 -23
  41. package/src/llama.cpp/src/llama-context.h +8 -2
  42. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  43. package/src/llama.cpp/src/llama-graph.h +17 -4
  44. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  45. package/src/llama.cpp/src/llama-hparams.h +5 -1
  46. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  47. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  48. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  49. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  50. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  51. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  52. package/src/llama.cpp/src/llama-model.cpp +103 -44
  53. package/src/llama.cpp/src/llama-model.h +1 -0
  54. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  55. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  56. package/src/llama.cpp/src/llama.cpp +675 -1
  57. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  58. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  59. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  60. package/src/llama.cpp/src/models/models.h +5 -5
  61. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  62. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  63. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
  // arguments can be nullptr to skip printing
  void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

+ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
+
  // extended sampling implementation:
  //
  // - set logits
@@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
  // - check if the token fits the grammar (if any)
  // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
  //
- // if grammar_first is true, the grammar is applied before the samplers (slower)
- // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
- //
- llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);

  // generalized version of common_sampler_sample
  //
@@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
  //
  // returns at least 1 token, up to idxs.size()
  //
- std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);

  // assume idxs == [ 0, 1, 2, ..., draft.size() ]
- std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);

  uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

@@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:

  llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
  const char * grammar_kind, const char * grammar_data);
+
+ struct common_sampler_deleter {
+ void operator()(common_sampler * s) { common_sampler_free(s); }
+ };
+
+ typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
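The hunks above remove the `grammar_first` flag from the sampling entry points and add an RAII alias (`common_sampler_ptr`) plus a `common_sampler_get` accessor. The following sketch is illustrative only and not part of the diff; it assumes the existing `common_sampler_init`, `common_sampler_accept`, and `common_sampler_free` signatures in `common/sampling.h` are otherwise unchanged.

```cpp
#include "sampling.h"

// Hypothetical usage of the updated API surface shown above.
void sample_one(const llama_model * model, llama_context * ctx,
                const common_params_sampling & sparams) {
    // common_sampler_ptr releases the sampler via common_sampler_free() on scope exit
    common_sampler_ptr smpl(common_sampler_init(model, sparams));
    if (!smpl) {
        return;
    }

    // the grammar_first parameter is gone; grammar handling is now internal to the call
    const llama_token id = common_sampler_sample(smpl.get(), ctx, /*idx =*/ -1);
    common_sampler_accept(smpl.get(), id, /*accept_grammar =*/ true);
}
```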
@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
  for (int i = 0; i < params.n_draft; ++i) {
  common_batch_clear(batch);

- common_sampler_sample(smpl, ctx_dft, 0, true);
+ common_sampler_sample(smpl, ctx_dft, 0);

  const auto * cur_p = common_sampler_get_candidates(smpl, true);

@@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
  # TODO
  else()
  set(GGML_STANDALONE OFF)
+
+ if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+ endif()
  endif()

  if (EMSCRIPTEN)
@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
  // call with a worst-case graph to avoid buffer reallocations
  // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
  // returns false if the buffer allocation failed
+ // ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
  GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+ GGML_API void ggml_gallocr_reserve_n_size(
+ ggml_gallocr_t galloc,
+ struct ggml_cgraph * graph,
+ const int * node_buffer_ids,
+ const int * leaf_buffer_ids,
+ size_t * sizes);
  GGML_API bool ggml_gallocr_reserve_n(
  ggml_gallocr_t galloc,
  struct ggml_cgraph * graph,
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i

  // Utils
  // Create a buffer and allocate all the tensors in a ggml_context
+ // ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+ GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
  GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
  GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

@@ -307,6 +307,7 @@ extern "C" {
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

  // Initialize backend buffers from a measure graph
+ GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
  GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

  GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
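The size-query additions above (the gallocr, context-tensor, and scheduler variants) all report how much memory the corresponding reserve/alloc call would need without actually allocating. A minimal sketch, illustrative only; it assumes the scheduler variant writes one entry per backend into `sizes`:

```cpp
#include <cstdio>
#include <vector>

#include "ggml-alloc.h"
#include "ggml-backend.h"

// Illustrative only: dry-run the allocations before committing to them.
void report_memory_needs(struct ggml_context * ctx, ggml_backend_buffer_type_t buft,
                         ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    // size of the buffer that ggml_backend_alloc_ctx_tensors_from_buft would create
    const size_t ctx_size = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
    printf("ctx tensors: %zu bytes\n", ctx_size);

    // per-backend compute buffer sizes that ggml_backend_sched_reserve would allocate
    std::vector<size_t> sizes(ggml_backend_sched_get_n_backends(sched), 0);
    ggml_backend_sched_reserve_size(sched, measure_graph, sizes.data());
    for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
        printf("backend %d compute buffer: %zu bytes\n", i, sizes[i]);
    }
}
```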
@@ -99,6 +99,7 @@ extern "C" {
  GGML_BACKEND_API int ggml_cpu_has_sme (void);
  // other
  GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
+ GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes
  GGML_BACKEND_API int ggml_cpu_has_vsx (void);
  GGML_BACKEND_API int ggml_cpu_has_vxe (void);
  GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
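`ggml_cpu_get_rvv_vlen` complements the boolean `ggml_cpu_has_riscv_v` with the vector register width in bytes; later hunks in this diff use it to gate the Q4_0 8x8 repack path. A trivial, illustrative sketch of branching on it (the 32-byte threshold mirrors `QK4_0` but is written here as a plain constant):

```cpp
#include "ggml-cpu.h"

// Illustrative only: pick a kernel variant based on the RISC-V vector length.
bool prefer_wide_rvv_path(void) {
    // ggml_cpu_get_rvv_vlen() returns 0 on non-RISC-V builds or without RVV intrinsics
    return ggml_cpu_has_riscv_v() && ggml_cpu_get_rvv_vlen() >= 32;
}
```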
@@ -2305,13 +2305,11 @@ extern "C" {
  float stop,
  float step);

- #define GGML_KQ_MASK_PAD 1
-
- // q: [n_embd_k, n_batch, n_head, ne3 ]
- // k: [n_embd_k, n_kv, n_head_kv, ne3 ]
- // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
- // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
- // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
+ // q: [n_embd_k, n_batch, n_head, ne3 ]
+ // k: [n_embd_k, n_kv, n_head_kv, ne3 ]
+ // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
+ // mask: [n_kv, n_batch, ne32, ne33]
+ // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
  //
  // broadcast:
  // n_head % n_head_kv == 0
@@ -2617,7 +2615,8 @@ extern "C" {

  // Set callback for all future logging events.
  // If this is not called, or NULL is supplied, everything is output on stderr.
- GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+ GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
+ GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);

  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);

@@ -386,6 +386,9 @@ if (GGML_CPU_ALL_VARIANTS)
  ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
  ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
  ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+ ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
+ ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
+ ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
  elseif (APPLE)
  ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
  ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
@@ -24,6 +24,7 @@

  #define UNUSED GGML_UNUSED

+ #if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
  static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
  int16x8_t * out_mins,
  int8_t * out_scales) {
@@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
  scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
  memcpy(out_scales, scales_u32, 8);
  }
+ #endif

  void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(QK8_0 == 32);
@@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
  } ggml_arm_arch_features = { 0 };
  #endif

+ #if defined(__riscv)
+ struct ggml_riscv_arch_features_type {
+ int rvv_vlen;
+ } ggml_riscv_arch_features = { 0 };
+ #endif

  #if defined(_WIN32)

@@ -187,6 +192,9 @@ typedef void * thread_ret_t;

  typedef pthread_t ggml_thread_t;

+ #define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
+ #define GGML_THREADPOOL_N_THREADS_BITS (16)
+
  #if defined(__APPLE__)
  #include <unistd.h>
  #include <mach/mach.h>
@@ -449,7 +457,7 @@ struct ggml_threadpool {
  struct ggml_cplan * cplan;

  // synchronization primitives
- atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
+ atomic_int n_graph; // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
  atomic_int GGML_CACHE_ALIGN n_barrier;
  atomic_int GGML_CACHE_ALIGN n_barrier_passed;
  atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -457,12 +465,10 @@ struct ggml_threadpool {
  // these are atomic as an annotation for thread-sanitizer
  atomic_bool stop; // Used for stopping the threadpool altogether
  atomic_bool pause; // Used for pausing the threadpool or individual threads
- atomic_int abort; // Used for aborting processing of a graph
+ atomic_int abort; // Used for aborting processing of a graph

  struct ggml_compute_state * workers; // per thread state
- int n_threads_max; // number of threads in the pool
- atomic_int n_threads_cur; // number of threads used in the current graph
-
+ int n_threads; // Number of threads in the pool
  int32_t prio; // Scheduling priority
  uint32_t poll; // Polling level (0 - no polling)

@@ -539,7 +545,7 @@ struct ggml_state {
  static struct ggml_state g_state = {0};

  void ggml_barrier(struct ggml_threadpool * tp) {
- int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+ int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
  if (n_threads == 1) {
  return;
  }
@@ -556,7 +562,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
  // last thread
  atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);

- // exit barrier (fill seq-cst fence)
+ // exit barrier (full seq-cst fence)
  atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
  return;
  }
@@ -702,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
  #endif
  #endif // __ARM_ARCH

+ #if defined(__riscv) && defined(__riscv_v_intrinsic)
+ #include <riscv_vector.h>
+ static void ggml_init_riscv_arch_features(void) {
+ ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+ }
+ #else
+ static void ggml_init_riscv_arch_features(void) {}
+ #endif
+
  struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
  GGML_ASSERT(!ggml_get_no_alloc(ctx));

@@ -2628,7 +2643,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
  void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
  if (!threadpool) return;

- const int n_threads = threadpool->n_threads_max;
+ const int n_threads = threadpool->n_threads;

  #ifndef GGML_USE_OPENMP
  struct ggml_compute_state* workers = threadpool->workers;
@@ -2704,7 +2719,7 @@ struct ggml_cplan ggml_graph_plan(
  //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
  }
  if (n_threads <= 0) {
- n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+ n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
  }

  #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@@ -2912,12 +2927,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

  struct ggml_compute_params params = {
  /*.ith =*/ state->ith,
- /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+ /*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
  /*.wsize =*/ cplan->work_size,
  /*.wdata =*/ cplan->work_data,
  /*.threadpool=*/ tp,
  };

+ GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
  for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
  struct ggml_tensor * node = cgraph->nodes[node_n];

@@ -2939,6 +2956,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  }
  }

+ GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
  ggml_barrier(state->threadpool);

  return 0;
@@ -2946,27 +2965,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

  #ifndef GGML_USE_OPENMP

- // check if thread is active
- static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
- struct ggml_threadpool * threadpool = state->threadpool;
- int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
- return (state->ith < n_threads);
- }
-
  // check if thread is ready to proceed (exit from polling or sleeping)
+ // returns true if loops should exit, sets state->pending to indicate new work
  static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
  struct ggml_threadpool * threadpool = state->threadpool;

  if (state->pending || threadpool->stop || threadpool->pause) { return true; }

  // check for new graph/work
- int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
- if (new_graph != state->last_graph) {
- state->pending = ggml_graph_compute_thread_active(state);
- state->last_graph = new_graph;
+ int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+ int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
+ if (n_graph != state->last_graph) {
+ state->pending = (state->ith < n_threads);
+ state->last_graph = n_graph;
+ return true;
  }

- return state->pending;
+ return false;
  }

  // sync thread state after polling
@@ -2983,11 +2998,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
  static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
  struct ggml_threadpool * threadpool = state->threadpool;

- // Skip polling for unused threads
- if (!ggml_graph_compute_thread_active(state)) {
- return state->pending;
- }
-
  // This seems to make 0 ... 100 a decent range for polling level across modern processors.
  // Perhaps, we can adjust it dynamically based on load and things.
  const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@@ -3049,7 +3059,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
  ggml_graph_compute_check_for_work(state);
  if (state->pending) {
  state->pending = false;
-
  ggml_graph_compute_thread(state);
  }
  }
@@ -3064,14 +3073,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int

  ggml_mutex_lock(&threadpool->mutex);

- GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+ // Update the number of active threads and the graph count
+ int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
+ n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);

- // Update the number of active threads
- atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+ GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);

  // Indicate the graph is ready to be processed
  // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
- atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+ atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);

  if (threadpool->pause) {
  // Update main thread prio and affinity to match the threadpool settings
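The kickoff hunk above packs two values into the single atomic `n_graph`: the active thread count in the low `GGML_THREADPOOL_N_THREADS_BITS` bits and an incrementing graph counter in the high bits, replacing the separate `n_threads_cur` atomic. A worked sketch of that encoding, illustrative only:

```cpp
// Illustrative only: how the packed value is built and read back, assuming
// GGML_THREADPOOL_N_THREADS_BITS == 16 and GGML_THREADPOOL_N_THREADS_MASK == 0xffff.
static int pack_n_graph(int graph_id, int n_threads) {
    return (graph_id << 16) | (n_threads & 0xffff);
}

static void unpack_n_graph(int packed, int * graph_id, int * n_threads) {
    *n_threads = packed & 0xffff; // what ggml_barrier() and the compute params read
    *graph_id  = packed >> 16;    // what workers compare against state->last_graph
}

// e.g. graph #3 dispatched to 8 threads: pack_n_graph(3, 8) == 0x00030008
```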
@@ -3109,8 +3119,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
  threadpool->pause = tpp->paused;
  threadpool->abort = -1;
  threadpool->workers = NULL;
- threadpool->n_threads_max = tpp->n_threads;
- threadpool->n_threads_cur = tpp->n_threads;
+ threadpool->n_threads = tpp->n_threads;
  threadpool->poll = tpp->poll;
  threadpool->prio = tpp->prio;
  threadpool->ec = GGML_STATUS_SUCCESS;
@@ -3205,7 +3214,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  {
  // update the number of threads from the actual number of threads that we got from OpenMP
  n_threads = omp_get_num_threads();
- atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+ atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
  }

  // Apply thread CPU mask and priority
@@ -3218,13 +3227,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  ggml_graph_compute_thread(&threadpool->workers[ith]);
  }
  } else {
- atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+ atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
  ggml_graph_compute_thread(&threadpool->workers[0]);
  }
  #else
- if (n_threads > threadpool->n_threads_max) {
- GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
- n_threads = threadpool->n_threads_max;
+ if (n_threads > threadpool->n_threads) {
+ GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+ n_threads = threadpool->n_threads;
  }

  // Kick all threads to start the new graph
@@ -3464,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
  #endif
  }

+ int ggml_cpu_get_rvv_vlen(void) {
+ #if defined(__riscv) && defined(__riscv_v_intrinsic)
+ return ggml_riscv_arch_features.rvv_vlen;
+ #else
+ return 0;
+ #endif
+ }
+
  int ggml_cpu_has_f16c(void) {
  #if defined(__F16C__)
  return 1;
@@ -3630,6 +3647,10 @@ void ggml_cpu_init(void) {
  ggml_init_arm_arch_features();
  #endif

+ #if defined(__riscv)
+ ggml_init_riscv_arch_features();
+ #endif
+
  is_first_call = false;
  }

@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
  if (ggml_cpu_has_riscv_v()) {
  features.push_back({ "RISCV_V", "1" });
  }
+ if (ggml_cpu_get_rvv_vlen() > 0) {
+ static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+ features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+ }
  if (ggml_cpu_has_vsx()) {
  features.push_back({ "VSX", "1" });
  }
@@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
  static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

  if (cur->type == GGML_TYPE_Q4_0) {
- if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+ if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+ || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
  if (cur->ne[1] % 8 == 0) {
  return &q4_0_8x8_q8_0;
  }
@@ -313,6 +313,7 @@ extern "C" {
  bool check_tensors; // validate model tensor data
  bool use_extra_bufts; // use extra buffer types (used for weight repacking)
  bool no_host; // bypass host buffer allowing extra buffers to be used
+ bool no_alloc; // only load metadata and simulate memory allocations
  };

  // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
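`no_alloc` is a new flag on `llama_model_params`; per its comment it loads only metadata and simulates memory allocations. A hedged sketch of requesting such a dry-run load; that the regular load entry point is the intended consumer of the flag is an assumption, not something the diff shows:

```cpp
#include "llama.h"

// Illustrative only: metadata-only load with simulated allocations (assumed usage).
llama_model * dry_run_load(const char * path_model) {
    llama_model_params mparams = llama_model_default_params();
    mparams.no_alloc = true; // only load metadata and simulate memory allocations
    return llama_model_load_from_file(path_model, mparams);
}
```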
@@ -466,10 +467,24 @@ extern "C" {
  // Frees all allocated memory
  LLAMA_API void llama_free(struct llama_context * ctx);

+ // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+ // returns true if the parameters could be successfully modified to fit device memory
+ // this function is NOT thread safe because it modifies the global llama logger state
+ LLAMA_API bool llama_params_fit(
+ const char * path_model,
+ struct llama_model_params * mparams,
+ struct llama_context_params * cparams,
+ float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
+ struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+ size_t margin, // margin of memory to leave per device in bytes
+ uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
+ enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
+
  LLAMA_API int64_t llama_time_us(void);

  LLAMA_API size_t llama_max_devices(void);
  LLAMA_API size_t llama_max_parallel_sequences(void);
+ LLAMA_API size_t llama_max_tensor_buft_overrides(void);

  LLAMA_API bool llama_supports_mmap (void);
  LLAMA_API bool llama_supports_mlock (void);
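A hedged sketch of calling the new fitting entry point, pieced together only from the declarations and comments above. The margin and minimum context values are arbitrary, and feeding the fitted split/overrides back in through `mparams` is an assumption rather than something the diff shows:

```cpp
#include <vector>

#include "llama.h"

// Illustrative only.
bool load_fitted(const char * path_model, llama_model ** out_model) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

    const size_t   margin    = 512ull * 1024 * 1024; // leave ~512 MiB free per device (arbitrary)
    const uint32_t n_ctx_min = 4096;                 // do not shrink the context below this (arbitrary)

    if (!llama_params_fit(path_model, &mparams, &cparams, tensor_split.data(), overrides.data(),
                          margin, n_ctx_min, GGML_LOG_LEVEL_INFO)) {
        return false; // could not fit the model into device memory
    }

    // assumption: the fitted split/overrides are passed back in via the model params
    mparams.tensor_split          = tensor_split.data();
    mparams.tensor_buft_overrides = overrides.data();

    *out_model = llama_model_load_from_file(path_model, mparams);
    return *out_model != nullptr;
}
```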
@@ -1354,7 +1369,9 @@ extern "C" {

  // Set callback for all future logging events.
  // If this is not called, or NULL is supplied, everything is output on stderr.
- LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
+ // The logger state is global so these functions are NOT thread safe.
+ LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+ LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);

  //
  // Performance utils
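`llama_log_get` pairs with the long-standing `llama_log_set` and makes save/restore of the global (not thread-safe) logger state possible, e.g. around a call such as `llama_params_fit` that adjusts logging internally. Illustrative sketch only:

```cpp
#include "llama.h"

// Illustrative only: save the current logger, swap in a quiet one, restore afterwards.
static void quiet_logger(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level; (void) text; (void) user_data; // drop everything
}

void run_quietly(void (*noisy_op)(void)) {
    ggml_log_callback prev_cb = nullptr;
    void *            prev_ud = nullptr;
    llama_log_get(&prev_cb, &prev_ud); // new in this version

    llama_log_set(quiet_logger, nullptr);
    noisy_op();
    llama_log_set(prev_cb, prev_ud);   // restore the previous callback
}
```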