@fugood/llama.node 1.4.7 → 1.4.9

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
Files changed (70)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +23 -24
  4. package/src/LlamaContext.cpp +4 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +470 -223
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +44 -17
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +67 -54
  23. package/src/llama.cpp/common/sampling.h +8 -0
  24. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  26. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  27. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  29. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  41. package/src/llama.cpp/src/llama-arch.h +9 -2
  42. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  43. package/src/llama.cpp/src/llama-batch.h +4 -2
  44. package/src/llama.cpp/src/llama-context.cpp +93 -23
  45. package/src/llama.cpp/src/llama-context.h +8 -2
  46. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  47. package/src/llama.cpp/src/llama-graph.h +17 -4
  48. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -1
  50. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  51. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  52. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  53. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  54. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  55. package/src/llama.cpp/src/llama-mmap.h +5 -1
  56. package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
  57. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  58. package/src/llama.cpp/src/llama-model.cpp +110 -49
  59. package/src/llama.cpp/src/llama-model.h +1 -0
  60. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  61. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +665 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  66. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  67. package/src/llama.cpp/src/models/models.h +5 -5
  68. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  69. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  70. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/ggml/src/ggml-cpu/repack.h

@@ -98,6 +98,10 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
  // Native implementations
  void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -120,6 +124,10 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
  void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
  #if defined(__cplusplus)
  } // extern "C"
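As an illustrative sketch (not part of the package): the four kernels added above share the same (n, s, bs, vx, vy, nr, nc) signature as the existing repack GEMM/GEMV routines, so arch-specific and generic variants can be treated as interchangeable function pointers. The typedef and pick_gemm_q8_0() helper below are hypothetical and only illustrate that convention.

    #include <stddef.h>
    #include "repack.h"   /* declares the ggml_gemv_q8_0_* / ggml_gemm_q8_0_* kernels shown above */

    /* common signature shared by the repack GEMM/GEMV kernels */
    typedef void (*ggml_repack_mul_mat_t)(int n, float * s, size_t bs,
                                          const void * vx, const void * vy,
                                          int nr, int nc);

    static ggml_repack_mul_mat_t pick_gemm_q8_0(int have_arch_kernel) {
        /* arch-optimized 4x8 tile kernel when available, portable fallback otherwise */
        return have_arch_kernel ? ggml_gemm_q8_0_4x8_q8_0
                                : ggml_gemm_q8_0_4x8_q8_0_generic;
    }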
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp

@@ -195,8 +195,48 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
  sumf += (ggml_float)_mm_cvtss_f32(g);
 
  #undef LOAD
- #endif
+ #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma)
+ size_t vl = __riscv_vsetvlmax_e32m4();
+
+ // initialize accumulators to all zeroes
+ vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+ vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+ // calculate step size
+ const size_t epr = __riscv_vsetvlmax_e16m2();
+ const size_t step = epr * 2;
+ const int np = (n & ~(step - 1));
+
+ // unroll by 2
+ for (; i < np; i += step) {
+ vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr);
+ vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr);
+ vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+
+ vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr);
+ vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr);
+ vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+ }
 
+ // accumulate in 1 register
+ vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl);
+
+ // leftovers
+ for (i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m2(n - i);
+ vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl);
+ vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl);
+ vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl);
+ }
+
+ // reduce
+ vl = __riscv_vsetvlmax_e32m4();
+ vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+ sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
+
+ #endif
 
  for (; i < n; ++i) {
  sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
  GGML_BF16_TO_FP32(y[i]));
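For reference, the scalar tail above defines the value the new zvfbfwma path accumulates: a bf16 dot product widened to f32. A minimal standalone sketch of that computation follows; bf16_to_f32() and bf16_dot_ref() are hypothetical names, not ggml APIs.

    #include <stdint.h>
    #include <string.h>

    static inline float bf16_to_f32(uint16_t h) {
        uint32_t bits = (uint32_t) h << 16;   /* bf16 is the high half of an IEEE-754 f32 */
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }

    static float bf16_dot_ref(int n, const uint16_t * x, const uint16_t * y) {
        float sumf = 0.0f;
        for (int i = 0; i < n; ++i) {
            /* widen each bf16 operand to f32, multiply, accumulate */
            sumf += bf16_to_f32(x[i]) * bf16_to_f32(y[i]);
        }
        return sumf;
    }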
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -224,13 +224,71 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
  }
  GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
  GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
- #elif defined(__riscv_v_intrinsic)
- // todo: RVV impl
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
- sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
- }
- }
+
+ #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+ size_t vl = __riscv_vsetvlmax_e32m4();
+
+ // initialize accumulators to all zeroes
+ vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+ vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+ vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+ vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+ // calculate step size
+ const size_t epr = __riscv_vsetvlmax_e16m2();
+ const size_t step = epr * 2;
+ const int np = (n & ~(step - 1));
+
+ // unroll by 2 along the row dimension
+ for (int i = 0; i < np; i += step) {
+ vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+ vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+ vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+ vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+ vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+ vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+ vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+ vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+ vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+ vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
+ }
+
+ vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+ vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
+
+ // leftovers
+ for (int i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m2(n - i);
+ vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+ vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+ vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
+
+ vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+ vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
+ }
+
+ // reduce
+ vl = __riscv_vsetvlmax_e32m2();
+ vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+ __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
+ vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+ __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
+ vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
+ acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+
+ vl = __riscv_vsetvlmax_e32m2();
+ vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+ __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+ vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+ __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
+ vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
+ acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+ sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
+ sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
+
  #else
  const int np = (n & ~(GGML_F16_STEP - 1));
 
@@ -475,15 +533,39 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
  }
  np = n;
  #elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
- const int np = n;
- _Float16 hv = (_Float16)v;
- for (int i = 0, avl; i < n; i += avl) {
- avl = __riscv_vsetvl_e16m8(n - i);
- vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
- vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
- vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
- __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
+ const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+ const _Float16 scale = *(const _Float16*)(&s);
+
+ // calculate step size
+ const int epr = __riscv_vsetvlmax_e16m4();
+ const int step = epr * 2;
+ int np = (n & ~(step - 1));
+
+ // unroll by 2
+ for (int i = 0; i < np; i += step) {
+ vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+ ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+
+ vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
+ vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+ ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+ }
+
+ // leftovers
+ int vl;
+ for (int i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m4(n - i);
+ vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+ ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
  }
+ np = n;
  #elif defined(GGML_SIMD)
  const int np = (n & ~(GGML_F16_STEP - 1));
 
@@ -724,13 +806,34 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
  svst1_f16(pg, (__fp16 *)(y + np), out);
  }
  #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
- for (int i = 0, vl; i < n; i += vl) {
- vl = __riscv_vsetvl_e16m2(n - i);
- vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl);
- vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);
- vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);
- vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);
- __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl);
+ const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+ const _Float16 scale = *(const _Float16*)(&s);
+
+ // calculate step size
+ const int epr = __riscv_vsetvlmax_e16m4();
+ const int step = epr * 2;
+ const int np = (n & ~(step - 1));
+
+ // unroll by 2
+ for (int i = 0; i < np; i += step) {
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+ ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+
+ vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+ ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+ }
+
+ // leftovers
+ int vl;
+ for (int i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m4(n - i);
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+ ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
  }
  #elif defined(GGML_SIMD)
  const int np = (n & ~(GGML_F16_STEP - 1));
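The behavioral nuance in this hunk: the removed code widened each element to f32, scaled, and narrowed back to fp16, while the new code multiplies directly in fp16 via __riscv_vfmul_vf_f16m4. Both compute y[i] *= v. The hypothetical helpers below (not part of the package, assuming _Float16 support as the RVV path does) spell out the two variants.

    /* old path: widen to f32, scale, narrow back to fp16 */
    static void vec_scale_f16_via_f32(int n, _Float16 * y, float v) {
        for (int i = 0; i < n; ++i) {
            y[i] = (_Float16)((float) y[i] * v);
        }
    }

    /* new path: convert the scalar once, scale in fp16 directly */
    static void vec_scale_f16_direct(int n, _Float16 * y, float v) {
        const _Float16 scale = (_Float16) v;
        for (int i = 0; i < n; ++i) {
            y[i] *= scale;
        }
    }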
package/src/llama.cpp/include/llama.h

@@ -313,6 +313,7 @@ extern "C" {
  bool check_tensors; // validate model tensor data
  bool use_extra_bufts; // use extra buffer types (used for weight repacking)
  bool no_host; // bypass host buffer allowing extra buffers to be used
+ bool no_alloc; // only load metadata and simulate memory allocations
  };
 
  // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -466,10 +467,24 @@
  // Frees all allocated memory
  LLAMA_API void llama_free(struct llama_context * ctx);
 
+ // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+ // returns true if the parameters could be successfully modified to fit device memory
+ // this function is NOT thread safe because it modifies the global llama logger state
+ LLAMA_API bool llama_params_fit(
+ const char * path_model,
+ struct llama_model_params * mparams,
+ struct llama_context_params * cparams,
+ float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
+ struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+ size_t margin, // margin of memory to leave per device in bytes
+ uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
+ enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
+
  LLAMA_API int64_t llama_time_us(void);
 
  LLAMA_API size_t llama_max_devices(void);
  LLAMA_API size_t llama_max_parallel_sequences(void);
+ LLAMA_API size_t llama_max_tensor_buft_overrides(void);
 
  LLAMA_API bool llama_supports_mmap (void);
  LLAMA_API bool llama_supports_mlock (void);
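A hedged sketch of how the new llama_params_fit() might be driven, based only on the declaration above. The try_fit() helper, the buffer sizes, the margin, and the minimum context value are placeholder assumptions; the writable arrays would normally be sized from llama_max_devices() and llama_max_tensor_buft_overrides().

    #include "llama.h"

    static bool try_fit(const char * model_path) {
        struct llama_model_params   mparams = llama_model_default_params();
        struct llama_context_params cparams = llama_context_default_params();

        /* writable scratch buffers the fitter may adjust (sized generously here;
           see the llama_max_devices / llama_max_tensor_buft_overrides comments above) */
        float tensor_split[64] = {0};
        struct llama_model_tensor_buft_override overrides[256] = {{0}};

        /* leave ~512 MiB free per device and never shrink the context below 4096 tokens */
        return llama_params_fit(model_path, &mparams, &cparams,
                                tensor_split, overrides,
                                (size_t) 512 * 1024 * 1024,  /* margin, in bytes */
                                4096,                        /* n_ctx_min */
                                GGML_LOG_LEVEL_INFO);        /* log level during fitting */
    }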
@@ -1354,7 +1369,9 @@
 
  // Set callback for all future logging events.
  // If this is not called, or NULL is supplied, everything is output on stderr.
- LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
+ // The logger state is global so these functions are NOT thread safe.
+ LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+ LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
  //
  // Performance utils
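The new llama_log_get() makes it possible to snapshot and later restore the global logger around code that changes it, such as llama_params_fit(), which the header above notes modifies the global logger state. A hedged sketch; with_default_logging() is a hypothetical helper, not part of the package.

    #include <stddef.h>
    #include "llama.h"

    static void with_default_logging(void) {
        ggml_log_callback prev_cb   = NULL;
        void *            prev_data = NULL;
        llama_log_get(&prev_cb, &prev_data);   /* snapshot the current (global) logger */

        llama_log_set(NULL, NULL);             /* NULL falls back to the default stderr output */
        /* ... run code that should not go through the custom logger ... */

        llama_log_set(prev_cb, prev_data);     /* restore the previous logger */
    }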