@fugood/llama.node 1.4.6 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +25 -26
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
  9. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +3 -2
  11. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  12. package/src/llama.cpp/common/chat.cpp +272 -0
  13. package/src/llama.cpp/common/common.cpp +130 -67
  14. package/src/llama.cpp/common/common.h +40 -16
  15. package/src/llama.cpp/common/console.cpp +680 -47
  16. package/src/llama.cpp/common/console.h +30 -8
  17. package/src/llama.cpp/common/download.cpp +69 -25
  18. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  19. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  20. package/src/llama.cpp/common/log.cpp +5 -0
  21. package/src/llama.cpp/common/log.h +1 -0
  22. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  23. package/src/llama.cpp/common/preset.cpp +206 -0
  24. package/src/llama.cpp/common/preset.h +32 -0
  25. package/src/llama.cpp/common/sampling.cpp +91 -92
  26. package/src/llama.cpp/common/sampling.h +11 -6
  27. package/src/llama.cpp/common/speculative.cpp +1 -1
  28. package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
  29. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  30. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  31. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
  37. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  41. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  42. package/src/llama.cpp/src/llama-arch.h +9 -2
  43. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  44. package/src/llama.cpp/src/llama-batch.h +4 -2
  45. package/src/llama.cpp/src/llama-context.cpp +99 -29
  46. package/src/llama.cpp/src/llama-context.h +9 -3
  47. package/src/llama.cpp/src/llama-grammar.cpp +233 -33
  48. package/src/llama.cpp/src/llama-grammar.h +20 -1
  49. package/src/llama.cpp/src/llama-graph.cpp +85 -17
  50. package/src/llama.cpp/src/llama-graph.h +17 -4
  51. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  52. package/src/llama.cpp/src/llama-hparams.h +5 -1
  53. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  55. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  56. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  57. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  58. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  59. package/src/llama.cpp/src/llama-model.cpp +123 -52
  60. package/src/llama.cpp/src/llama-model.h +1 -0
  61. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +675 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
  66. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  67. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  68. package/src/llama.cpp/src/models/models.h +8 -7
  69. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  70. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  71. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
@@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = { 0 };
 #endif
 
+#if defined(__riscv)
+struct ggml_riscv_arch_features_type {
+    int rvv_vlen;
+} ggml_riscv_arch_features = { 0 };
+#endif
 
 #if defined(_WIN32)
 
@@ -187,6 +192,9 @@ typedef void * thread_ret_t;
 
 typedef pthread_t ggml_thread_t;
 
+#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
+#define GGML_THREADPOOL_N_THREADS_BITS (16)
+
 #if defined(__APPLE__)
 #include <unistd.h>
 #include <mach/mach.h>
@@ -449,7 +457,7 @@ struct ggml_threadpool {
     struct ggml_cplan * cplan;
 
     // synchronization primitives
-    atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
+    atomic_int n_graph;       // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -457,12 +465,10 @@ struct ggml_threadpool {
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop;         // Used for stopping the threadpool altogether
     atomic_bool pause;        // Used for pausing the threadpool or individual threads
-    atomic_int abort;         // Used for aborting processing of a graph
+    atomic_int  abort;        // Used for aborting processing of a graph
 
     struct ggml_compute_state * workers;   // per thread state
-    int n_threads_max;        // number of threads in the pool
-    atomic_int n_threads_cur; // number of threads used in the current graph
-
+    int n_threads;            // Number of threads in the pool
     int32_t prio;             // Scheduling priority
     uint32_t poll;            // Polling level (0 - no polling)
 
@@ -490,6 +496,15 @@ static inline void ggml_thread_cpu_relax(void) {
 static inline void ggml_thread_cpu_relax(void) {
     _mm_pause();
 }
+#elif defined(__riscv)
+static inline void ggml_thread_cpu_relax(void) {
+#ifdef __riscv_zihintpause
+    __asm__ __volatile__ ("pause");
+#else
+    /* Encoding of the pause instruction */
+    __asm__ __volatile__ (".4byte 0x100000F");
+#endif
+}
 #else
 static inline void ggml_thread_cpu_relax(void) {;}
 #endif
@@ -530,7 +545,7 @@ struct ggml_state {
 static struct ggml_state g_state = {0};
 
 void ggml_barrier(struct ggml_threadpool * tp) {
-    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+    int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
     if (n_threads == 1) {
         return;
     }
@@ -547,7 +562,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
 
-        // exit barrier (fill seq-cst fence)
+        // exit barrier (full seq-cst fence)
         atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
         return;
     }
@@ -693,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
 #endif
 #endif // __ARM_ARCH
 
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+static void ggml_init_riscv_arch_features(void) {
+    ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+}
+#else
+static void ggml_init_riscv_arch_features(void) {}
+#endif
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));
 
@@ -2619,7 +2643,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
-    const int n_threads = threadpool->n_threads_max;
+    const int n_threads = threadpool->n_threads;
 
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
@@ -2695,7 +2719,7 @@ struct ggml_cplan ggml_graph_plan(
         //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
     }
     if (n_threads <= 0) {
-        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+        n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
     }
 
 #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@@ -2903,12 +2927,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     struct ggml_compute_params params = {
         /*.ith       =*/ state->ith,
-        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+        /*.nth       =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
         /*.wsize     =*/ cplan->work_size,
         /*.wdata     =*/ cplan->work_data,
         /*.threadpool=*/ tp,
     };
 
+    GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
@@ -2930,6 +2956,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
     }
 
+    GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     ggml_barrier(state->threadpool);
 
     return 0;
@@ -2937,27 +2965,23 @@
 
 #ifndef GGML_USE_OPENMP
 
-// check if thread is active
-static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    return (state->ith < n_threads);
-}
-
 // check if thread is ready to proceed (exit from polling or sleeping)
+// returns true if loops should exit, sets state->pending to indicate new work
 static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
     if (state->pending || threadpool->stop || threadpool->pause) { return true; }
 
     // check for new graph/work
-    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
-    if (new_graph != state->last_graph) {
-        state->pending = ggml_graph_compute_thread_active(state);
-        state->last_graph = new_graph;
+    int n_graph   = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
+    if (n_graph != state->last_graph) {
+        state->pending    = (state->ith < n_threads);
+        state->last_graph = n_graph;
+        return true;
     }
 
-    return state->pending;
+    return false;
 }
 
 // sync thread state after polling
@@ -2974,11 +2998,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
-    // Skip polling for unused threads
-    if (!ggml_graph_compute_thread_active(state)) {
-        return state->pending;
-    }
-
     // This seems to make 0 ... 100 a decent range for polling level across modern processors.
     // Perhaps, we can adjust it dynamically based on load and things.
     const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@@ -3040,7 +3059,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
         ggml_graph_compute_check_for_work(state);
         if (state->pending) {
             state->pending = false;
-
             ggml_graph_compute_thread(state);
         }
     }
@@ -3055,14 +3073,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
 
     ggml_mutex_lock(&threadpool->mutex);
 
-    GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+    // Update the number of active threads and the graph count
+    int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
+    n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);
 
-    // Update the number of active threads
-    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+    GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);
 
     // Indicate the graph is ready to be processed
     // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
-    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+    atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);
 
     if (threadpool->pause) {
         // Update main thread prio and affinity to match the threadpool settings
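This kickoff hunk is the heart of the threadpool rework in this release: the graph generation counter and the active-thread count now share the single atomic n_graph word, so one seq-cst store publishes both, and readers such as ggml_barrier() and ggml_graph_compute_thread_ready() recover the thread count with a single relaxed load plus a mask. A self-contained sketch of the encoding, using the two macros introduced above (the thread count of 8 is illustrative):

    #include <stdio.h>

    #define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
    #define GGML_THREADPOOL_N_THREADS_BITS (16)

    int main(void) {
        int n_graph = 0; // the packed word; an atomic_int in the real threadpool

        // kickoff: bump the graph counter in the upper bits, store the
        // active-thread count (here 8) in the low 16 bits
        int graph = n_graph >> GGML_THREADPOOL_N_THREADS_BITS;
        n_graph   = ((graph + 1) << GGML_THREADPOOL_N_THREADS_BITS)
                  | (8 & GGML_THREADPOOL_N_THREADS_MASK);

        // readers: one load yields both values
        printf("graph #%d with %d active threads\n",
               (int)(n_graph >> GGML_THREADPOOL_N_THREADS_BITS),
               (int)(n_graph &  GGML_THREADPOOL_N_THREADS_MASK)); // graph #1 with 8 active threads
        return 0;
    }

Because a worker compares state->last_graph against the whole word, a thread that observes a new generation also observes the matching thread count, which is why the separate n_threads_cur field and the ggml_graph_compute_thread_active() helper could be removed.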
@@ -3100,8 +3119,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->pause           = tpp->paused;
     threadpool->abort           = -1;
     threadpool->workers         = NULL;
-    threadpool->n_threads_max   = tpp->n_threads;
-    threadpool->n_threads_cur   = tpp->n_threads;
+    threadpool->n_threads       = tpp->n_threads;
     threadpool->poll            = tpp->poll;
     threadpool->prio            = tpp->prio;
     threadpool->ec              = GGML_STATUS_SUCCESS;
@@ -3196,7 +3214,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         {
             // update the number of threads from the actual number of threads that we got from OpenMP
             n_threads = omp_get_num_threads();
-            atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+            atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
         }
 
         // Apply thread CPU mask and priority
@@ -3209,13 +3227,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
             ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
-        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+        atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
         ggml_graph_compute_thread(&threadpool->workers[0]);
     }
 #else
-    if (n_threads > threadpool->n_threads_max) {
-        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
-        n_threads = threadpool->n_threads_max;
+    if (n_threads > threadpool->n_threads) {
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+        n_threads = threadpool->n_threads;
     }
 
     // Kick all threads to start the new graph
@@ -3455,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
 #endif
 }
 
+int ggml_cpu_get_rvv_vlen(void) {
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+    return ggml_riscv_arch_features.rvv_vlen;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
     return 1;
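Together with the matching declaration in ggml-cpu.h (the +1 line in the file list above), this exposes the runtime RVV vector length in bytes, as reported by __riscv_vlenb(), or 0 when not built for RISC-V. A hedged probe, assuming ggml's CPU init has run (ggml_cpu_init() is otherwise called by the CPU backend):

    #include <stdio.h>
    #include "ggml-cpu.h"

    int main(void) {
        ggml_cpu_init(); // populates the arch feature structs, including rvv_vlen

        if (ggml_cpu_has_riscv_v()) {
            int vlen = ggml_cpu_get_rvv_vlen(); // bytes; 32 means 256-bit vectors
            printf("RVV vlen: %d bytes (%d bits)\n", vlen, 8 * vlen);
        } else {
            printf("RVV not available\n");
        }
        return 0;
    }

This is the value the repack hunk below keys on: with QK4_0 = 32, a vlen of at least 32 bytes (256-bit RVV) enables the Q4_0 8x8 repack path, and the same number is surfaced to backend consumers as the new RVV_VLEN feature string.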
@@ -3621,6 +3647,10 @@ void ggml_cpu_init(void) {
     ggml_init_arm_arch_features();
 #endif
 
+#if defined(__riscv)
+    ggml_init_riscv_arch_features();
+#endif
+
     is_first_call = false;
 }
 
@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_riscv_v()) {
             features.push_back({ "RISCV_V", "1" });
         }
+        if (ggml_cpu_get_rvv_vlen() > 0) {
+            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+        }
         if (ggml_cpu_has_vsx()) {
             features.push_back({ "VSX", "1" });
         }
@@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
 
     if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
             if (cur->ne[1] % 8 == 0) {
                 return &q4_0_8x8_q8_0;
             }
@@ -313,6 +313,7 @@ extern "C" {
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host;         // bypass host buffer allowing extra buffers to be used
+        bool no_alloc;        // only load metadata and simulate memory allocations
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
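The new no_alloc flag turns a model load into a metadata-only dry run with simulated allocations; presumably this is the hook the new llama_params_fit (next hunk) relies on for memory accounting. A speculative sketch, assuming the flag composes with the normal load path (the model path is a placeholder):

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        struct llama_model_params mparams = llama_model_default_params();
        mparams.no_alloc = true; // only load metadata and simulate memory allocations

        // "model.gguf" is a placeholder path
        struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            fprintf(stderr, "metadata load failed\n");
            return 1;
        }
        llama_model_free(model);
        return 0;
    }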
@@ -466,10 +467,24 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+    // returns true if the parameters could be successfully modified to fit device memory
+    // this function is NOT thread safe because it modifies the global llama logger state
+    LLAMA_API bool llama_params_fit(
+            const char * path_model,
+            struct llama_model_params * mparams,
+            struct llama_context_params * cparams,
+            float * tensor_split,           // writable buffer for tensor split, needs at least llama_max_devices elements
+            struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+            size_t margin,                  // margin of memory to leave per device in bytes
+            uint32_t n_ctx_min,             // minimum context size to set when trying to reduce memory use
+            enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
+
     LLAMA_API int64_t llama_time_us(void);
 
     LLAMA_API size_t llama_max_devices(void);
     LLAMA_API size_t llama_max_parallel_sequences(void);
+    LLAMA_API size_t llama_max_tensor_buft_overrides(void);
 
     LLAMA_API bool llama_supports_mmap (void);
     LLAMA_API bool llama_supports_mlock (void);
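A hedged usage sketch for the new fitting API, based only on the declaration above; the margin and minimum context values are illustrative. On success, mparams and cparams have been adjusted in place and can be handed to the regular load functions:

    #include <stdbool.h>
    #include <stdlib.h>
    #include "llama.h"

    static bool fit_params(const char * path_model,
                           struct llama_model_params * mparams,
                           struct llama_context_params * cparams) {
        // writable scratch buffers that llama_params_fit may fill in
        float * tensor_split = calloc(llama_max_devices(), sizeof(float));
        struct llama_model_tensor_buft_override * overrides =
            calloc(llama_max_tensor_buft_overrides(), sizeof(*overrides));

        bool ok = llama_params_fit(path_model, mparams, cparams,
                                   tensor_split, overrides,
                                   /*margin   =*/ 512ull * 1024 * 1024, // keep ~512 MiB free per device
                                   /*n_ctx_min=*/ 4096,
                                   GGML_LOG_LEVEL_INFO);

        // NOTE (assumption): on success, mparams may now reference
        // tensor_split/overrides, so free them only after loading the model;
        // the ownership rules are not spelled out in this header diff
        if (!ok) {
            free(tensor_split);
            free(overrides);
        }
        return ok;
    }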
@@ -1354,7 +1369,9 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
+    // The logger state is global so these functions are NOT thread safe.
+    LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
     //
     // Performance utils
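llama_log_get makes it possible to save and restore the global logger around code that replaces it, which matches why llama_params_fit above is documented as modifying the global llama logger state. A minimal sketch (my_quiet_logger is a user-supplied callback, not part of the API):

    #include <stdio.h>
    #include "llama.h"

    // hypothetical user callback: drop everything below WARN
    static void my_quiet_logger(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level >= GGML_LOG_LEVEL_WARN) {
            fputs(text, stderr);
        }
    }

    void with_quiet_logging(void) {
        ggml_log_callback prev_cb   = NULL;
        void *            prev_data = NULL;
        llama_log_get(&prev_cb, &prev_data); // save the current global logger

        llama_log_set(my_quiet_logger, NULL);
        // ... noisy work here, e.g. model loading or llama_params_fit ...

        llama_log_set(prev_cb, prev_data);   // restore; NOT thread safe, per the header comment
    }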
@@ -67,7 +67,7 @@ add_library(llama
             models/gemma-embedding.cpp
             models/gemma.cpp
             models/gemma2-iswa.cpp
-            models/gemma3-iswa.cpp
+            models/gemma3.cpp
             models/gemma3n-iswa.cpp
             models/glm4-moe.cpp
             models/glm4.cpp
@@ -139,6 +139,7 @@ add_library(llama
 set_target_properties(llama PROPERTIES
     VERSION ${LLAMA_INSTALL_VERSION}
     SOVERSION 0
+    MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
 )
 
 target_include_directories(llama PRIVATE .)