@fugood/llama.node 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50):
  1. package/package.json +14 -14
  2. package/src/llama.cpp/CMakeLists.txt +0 -1
  3. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  4. package/src/llama.cpp/common/arg.cpp +44 -0
  5. package/src/llama.cpp/common/common.cpp +22 -6
  6. package/src/llama.cpp/common/common.h +15 -1
  7. package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
  8. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  9. package/src/llama.cpp/ggml/include/ggml.h +104 -10
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  12. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
  19. package/src/llama.cpp/include/llama.h +13 -47
  20. package/src/llama.cpp/src/llama-arch.cpp +298 -3
  21. package/src/llama.cpp/src/llama-arch.h +22 -1
  22. package/src/llama.cpp/src/llama-batch.cpp +103 -71
  23. package/src/llama.cpp/src/llama-batch.h +31 -18
  24. package/src/llama.cpp/src/llama-chat.cpp +59 -1
  25. package/src/llama.cpp/src/llama-chat.h +3 -0
  26. package/src/llama.cpp/src/llama-context.cpp +134 -95
  27. package/src/llama.cpp/src/llama-context.h +13 -16
  28. package/src/llama.cpp/src/llama-cparams.h +3 -2
  29. package/src/llama.cpp/src/llama-graph.cpp +279 -180
  30. package/src/llama.cpp/src/llama-graph.h +183 -122
  31. package/src/llama.cpp/src/llama-hparams.cpp +47 -1
  32. package/src/llama.cpp/src/llama-hparams.h +12 -1
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  34. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  35. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  36. package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  37. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  40. package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
  41. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  42. package/src/llama.cpp/src/llama-memory.h +3 -0
  43. package/src/llama.cpp/src/llama-model.cpp +3373 -743
  44. package/src/llama.cpp/src/llama-model.h +20 -4
  45. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  46. package/src/llama.cpp/src/llama-vocab.cpp +376 -10
  47. package/src/llama.cpp/src/llama-vocab.h +43 -0
  48. package/src/llama.cpp/src/unicode.cpp +207 -0
  49. package/src/llama.cpp/src/unicode.h +2 -0
  50. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
@@ -20,6 +20,9 @@
20
20
 
21
21
  static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
22
22
 
23
+ // Work buffer size for im2col operations in CONV2D
24
+ #define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
25
+
23
26
  #ifdef __cplusplus
24
27
  extern "C" {
25
28
  #endif
@@ -65,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
65
68
  void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
66
69
  void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
67
70
  void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
71
+ void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
68
72
  void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
69
73
  void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
70
74
  void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -107,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru
107
111
  void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
108
112
  void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
109
113
  void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
114
+ void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
110
115
 
111
116
  #ifdef __cplusplus
112
117
  }
@@ -189,7 +189,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
189
189
  #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
190
190
  #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
191
191
  #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
192
- #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
192
+ #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
193
193
  #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
194
194
  #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
195
195
  #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
@@ -37,35 +37,35 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
37
37
  for (int i = 0; i < np; i += ggml_f32_step) {
38
38
  ax1 = GGML_F32_VEC_LOAD(x + i);
39
39
  ay1 = GGML_F32_VEC_LOAD(y + i);
40
- sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
40
+ sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
41
41
 
42
42
  ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
43
43
  ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
44
- sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
44
+ sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);
45
45
 
46
46
  ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
47
47
  ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
48
- sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
48
+ sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);
49
49
 
50
50
  ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
51
51
  ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
52
- sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
52
+ sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);
53
53
 
54
54
  ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
55
55
  ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
56
- sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
56
+ sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);
57
57
 
58
58
  ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
59
59
  ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
60
- sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
60
+ sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);
61
61
 
62
62
  ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
63
63
  ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
64
- sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
64
+ sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);
65
65
 
66
66
  ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
67
67
  ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
68
- sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
68
+ sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
69
69
  }
70
70
  // leftovers
71
71
  // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
@@ -73,7 +73,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
73
73
  for (int i = np; i < np2; i += ggml_f32_epr) {
74
74
  ax1 = GGML_F32_VEC_LOAD(x + i);
75
75
  ay1 = GGML_F32_VEC_LOAD(y + i);
76
- sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
76
+ sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
77
77
  }
78
78
  // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
79
79
  if (np2 < n) {
@@ -221,6 +221,9 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
221
221
  for (int i = np; i < n; ++i) {
222
222
  sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
223
223
  }
224
+
225
+ // if you hit this, you are likely running outside the FP range
226
+ assert(!isnan(sumf) && !isinf(sumf));
224
227
  #else
225
228
  for (int i = 0; i < n; ++i) {
226
229
  sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
@@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
163
163
 
164
164
  ax1 = GGML_F32_VEC_LOAD(x + i);
165
165
  ay1 = GGML_F32_VEC_LOAD(y + i);
166
- ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
166
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
167
167
 
168
168
  GGML_F32_VEC_STORE(y + i, ay1);
169
169
 
170
170
  ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
171
171
  ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
172
- ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
172
+ ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
173
173
 
174
174
  GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
175
175
 
176
176
  ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
177
177
  ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
178
- ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
178
+ ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
179
179
 
180
180
  GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
181
181
 
182
182
  ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
183
183
  ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
184
- ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
184
+ ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
185
185
 
186
186
  GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
187
187
 
188
188
  ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
189
189
  ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
190
- ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
190
+ ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
191
191
 
192
192
  GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
193
193
 
194
194
  ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
195
195
  ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
196
- ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
196
+ ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
197
197
 
198
198
  GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
199
199
 
200
200
  ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
201
201
  ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
202
- ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
202
+ ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
203
203
 
204
204
  GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
205
205
 
206
206
  ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
207
207
  ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
208
- ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
208
+ ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
209
209
 
210
210
  GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
211
211
  }
@@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
215
215
  for (int i = np; i < np2; i += ggml_f32_epr) {
216
216
  ax1 = GGML_F32_VEC_LOAD(x + i);
217
217
  ay1 = GGML_F32_VEC_LOAD(y + i);
218
- ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
218
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
219
219
 
220
220
  GGML_F32_VEC_STORE(y + i, ay1);
221
221
  }
@@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
351
351
  #endif
352
352
  }
353
353
 
354
+ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
355
+ #if defined(GGML_USE_ACCELERATE)
356
+ vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
357
+ #elif defined(GGML_SIMD)
358
+ #if defined(__ARM_FEATURE_SVE)
359
+ // scalar ; TODO: Write SVE code
360
+ for (int i = 0; i < n; ++i) {
361
+ y[i] = x[i]*s + b;
362
+ }
363
+ #else
364
+ const int np = (n & ~(GGML_F32_STEP - 1));
365
+
366
+ GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
367
+ GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
368
+
369
+ GGML_F32_VEC ay[GGML_F32_ARR];
370
+
371
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
372
+ for (int j = 0; j < GGML_F32_ARR; j++) {
373
+ ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
374
+ ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
375
+
376
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
377
+ }
378
+ }
379
+
380
+ // leftovers
381
+ for (int i = np; i < n; ++i) {
382
+ y[i] = x[i]*s + b;
383
+ }
384
+ #endif
385
+ #else
386
+ // scalar
387
+ for (int i = 0; i < n; ++i) {
388
+ y[i] = x[i]*s + b;
389
+ }
390
+ #endif
391
+ }
392
+
354
393
  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
355
394
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
356
395
  #if defined(GGML_USE_ACCELERATE)
@@ -959,6 +998,46 @@ inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_
959
998
  }
960
999
  }
961
1000
 
1001
+ inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
1002
+ for (int i = 0; i < n; ++i) {
1003
+ float xi = x[i];
1004
+ y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
1005
+ }
1006
+ }
1007
+
1008
+ inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1009
+ for (int i = 0; i < n; ++i) {
1010
+ float xi = GGML_CPU_FP16_TO_FP32(x[i]);
1011
+ float gi = GGML_CPU_FP16_TO_FP32(g[i]);
1012
+ y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
1013
+ }
1014
+ }
1015
+
1016
+ #ifdef GGML_GELU_QUICK_FP16
1017
+ inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1018
+ uint16_t t;
1019
+ for (int i = 0; i < n; ++i) {
1020
+ ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1021
+ memcpy(&t, &fp16, sizeof(uint16_t));
1022
+ y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
1023
+ }
1024
+ }
1025
+ #else
1026
+ inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1027
+ for (int i = 0; i < n; ++i) {
1028
+ y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
1029
+ }
1030
+ }
1031
+ #endif
1032
+
1033
+ inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1034
+ const uint16_t * i16 = (const uint16_t *) x;
1035
+ for (int i = 0; i < n; ++i) {
1036
+ float v = GGML_CPU_FP16_TO_FP32(g[i]);
1037
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
1038
+ }
1039
+ }
1040
+
962
1041
  inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
963
1042
  #ifndef GGML_USE_ACCELERATE
964
1043
  ggml_float sum = 0.0;
@@ -71,52 +71,13 @@ extern "C" {
71
71
  typedef int32_t llama_seq_id;
72
72
 
73
73
  enum llama_vocab_type {
74
- LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
75
- LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
76
- LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
77
- LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
78
- LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
79
- LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
80
- };
81
-
82
- // pre-tokenization types
83
- enum llama_vocab_pre_type {
84
- LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
85
- LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
86
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
87
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
88
- LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
89
- LLAMA_VOCAB_PRE_TYPE_MPT = 5,
90
- LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
91
- LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
92
- LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
93
- LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
94
- LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
95
- LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
96
- LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
97
- LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
98
- LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
99
- LLAMA_VOCAB_PRE_TYPE_PORO = 15,
100
- LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
101
- LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
102
- LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
103
- LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
104
- LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
105
- LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
106
- LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
107
- LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
108
- LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
109
- LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
110
- LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
111
- LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
112
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
113
- LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
114
- LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
115
- LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
116
- LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
117
- LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
118
- LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
119
- LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
74
+ LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
75
+ LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
76
+ LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
77
+ LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
78
+ LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
79
+ LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
80
+ LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
120
81
  };
121
82
 
122
83
  enum llama_rope_type {
@@ -374,6 +335,9 @@ extern "C" {
374
335
  bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
375
336
  // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
376
337
  // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
338
+ bool kv_unified; // use a unified buffer across the input sequences when computing the attention
339
+ // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
340
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14363
377
341
  };
378
342
 
379
343
  // model quantization parameters
@@ -764,7 +728,7 @@ extern "C" {
764
728
  // - lazily on next llama_decode()
765
729
  // p0 < 0 : [0, p1]
766
730
  // p1 < 0 : [p0, inf)
767
- DEPRECATED(void llama_kv_self_seq_div(
731
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
768
732
  struct llama_context * ctx,
769
733
  llama_seq_id seq_id,
770
734
  llama_pos p0,
@@ -1044,6 +1008,7 @@ extern "C" {
1044
1008
  LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
1045
1009
  LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
1046
1010
  LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
1011
+ LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
1047
1012
 
1048
1013
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1049
1014
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
@@ -1429,6 +1394,7 @@ extern "C" {
1429
1394
 
1430
1395
  int32_t n_p_eval;
1431
1396
  int32_t n_eval;
1397
+ int32_t n_reused; // number of times a ggml compute graph had been reused
1432
1398
  };
1433
1399
 
1434
1400
  struct llama_perf_sampler_data {