@fugood/llama.node 1.4.15 → 1.6.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/lib/binding.ts +1 -5
  2. package/lib/index.js +2 -2
  3. package/lib/index.ts +2 -2
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +76 -61
  6. package/src/LlamaContext.cpp +20 -32
  7. package/src/llama.cpp/common/CMakeLists.txt +12 -0
  8. package/src/llama.cpp/common/arg.cpp +20 -0
  9. package/src/llama.cpp/common/chat-parser.cpp +3 -3
  10. package/src/llama.cpp/common/chat-parser.h +4 -4
  11. package/src/llama.cpp/common/chat.cpp +289 -34
  12. package/src/llama.cpp/common/chat.h +32 -20
  13. package/src/llama.cpp/common/common.cpp +0 -1
  14. package/src/llama.cpp/common/common.h +31 -25
  15. package/src/llama.cpp/common/download.cpp +19 -14
  16. package/src/llama.cpp/common/jinja/caps.cpp +237 -0
  17. package/src/llama.cpp/common/jinja/caps.h +24 -0
  18. package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
  19. package/src/llama.cpp/common/jinja/lexer.h +157 -0
  20. package/src/llama.cpp/common/jinja/parser.cpp +591 -0
  21. package/src/llama.cpp/common/jinja/parser.h +21 -0
  22. package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
  23. package/src/llama.cpp/common/jinja/runtime.h +628 -0
  24. package/src/llama.cpp/common/jinja/string.cpp +207 -0
  25. package/src/llama.cpp/common/jinja/string.h +58 -0
  26. package/src/llama.cpp/common/jinja/utils.h +49 -0
  27. package/src/llama.cpp/common/jinja/value.cpp +1221 -0
  28. package/src/llama.cpp/common/jinja/value.h +464 -0
  29. package/src/llama.cpp/common/json-partial.h +1 -0
  30. package/src/llama.cpp/common/sampling.cpp +52 -19
  31. package/src/llama.cpp/ggml/include/ggml.h +39 -7
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
  34. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
  36. package/src/llama.cpp/include/llama-cpp.h +3 -1
  37. package/src/llama.cpp/include/llama.h +29 -2
  38. package/src/llama.cpp/src/llama-adapter.cpp +7 -13
  39. package/src/llama.cpp/src/llama-adapter.h +1 -3
  40. package/src/llama.cpp/src/llama-context.cpp +232 -144
  41. package/src/llama.cpp/src/llama-context.h +10 -0
  42. package/src/llama.cpp/src/llama-cparams.h +2 -0
  43. package/src/llama.cpp/src/llama-hparams.cpp +0 -36
  44. package/src/llama.cpp/src/llama-hparams.h +38 -1
  45. package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
  46. package/src/llama.cpp/src/llama-kv-cache.h +0 -2
  47. package/src/llama.cpp/src/llama-mmap.cpp +5 -1
  48. package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
  49. package/src/llama.cpp/src/llama-model.cpp +5 -1
  50. package/src/llama.cpp/src/llama-model.h +3 -2
  51. package/src/llama.cpp/src/llama-sampling.cpp +170 -13
@@ -630,10 +630,11 @@ extern "C" {
630
630
 
631
631
  // this tensor...
632
632
  enum ggml_tensor_flag {
633
- GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
634
- GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
635
- GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
636
- GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
633
+ GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
634
+ GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
635
+ GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
636
+ GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
637
+ GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed
637
638
  };
638
639
 
639
640
  enum ggml_tri_type {
@@ -2577,11 +2578,42 @@ extern "C" {
2577
2578
  struct ggml_tensor * grad,
2578
2579
  struct ggml_tensor * sgd_params); // alpha, weight decay
2579
2580
 
2581
+ // build forward multiple tensors and select one of them for computing
2582
+ // this is useful for creating graphs that have constant topology but compute different things based on the input
2583
+ // ref: https://github.com/ggml-org/llama.cpp/pull/18550
2580
2584
  //
2581
- // automatic differentiation
2585
+ // nodes:
2586
+ // | - build forward into the graph but do not compute
2587
+ // c - build forward into the graph and compute
2582
2588
  //
2589
+ // | | ... c ... |
2590
+ // | | ... c ... |
2591
+ // | | ... c ... |
2592
+ // [0 1 ... idx ... n-1] <-- ggml_build_forward_select(..., n, idx)
2593
+ // c
2594
+ // c
2595
+ //
2596
+ // example:
2597
+ // struct ggml_tensor * curs[3];
2598
+ //
2599
+ // curs[0] = compute0(...);
2600
+ // curs[1] = compute1(...);
2601
+ // curs[2] = compute2(...);
2602
+ //
2603
+ // int idx = select_branch(some_input);
2604
+ //
2605
+ // struct ggml_tensor * out = ggml_build_forward_select(cgraph, curs, 3, idx);
2606
+ //
2607
+ GGML_API struct ggml_tensor * ggml_build_forward_select(
2608
+ struct ggml_cgraph * cgraph,
2609
+ struct ggml_tensor ** tensors,
2610
+ int n_tensors,
2611
+ int idx);
2612
+
2613
+ GGML_API void ggml_build_forward_expand(
2614
+ struct ggml_cgraph * cgraph,
2615
+ struct ggml_tensor * tensor);
2583
2616
 
2584
- GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
2585
2617
  GGML_API void ggml_build_backward_expand(
2586
2618
  struct ggml_context * ctx, // context for gradient computation
2587
2619
  struct ggml_cgraph * cgraph,
@@ -2613,7 +2645,7 @@ extern "C" {
2613
2645
  GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
2614
2646
 
2615
2647
  // dump the graph into a file using the dot format
2616
- GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
2648
+ GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename);
2617
2649
 
2618
2650
  // TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
2619
2651
  typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
@@ -2943,6 +2943,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
2943
2943
  continue;
2944
2944
  }
2945
2945
 
2946
+ if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
2947
+ continue;
2948
+ }
2949
+
2946
2950
  ggml_compute_forward(&params, node);
2947
2951
 
2948
2952
  if (state->ith == 0 && cplan->abort_callback &&
@@ -7,10 +7,9 @@
7
7
  #include "unary-ops.h"
8
8
  #include "vec.h"
9
9
 
10
- #include <cfloat>
11
10
  #include <algorithm>
11
+ #include <cfloat>
12
12
  #include <cmath>
13
- #include <functional>
14
13
 
15
14
  // ggml_compute_forward_dup
16
15
 
@@ -7110,12 +7109,13 @@ void ggml_compute_forward_conv_2d_dw(
7110
7109
  }
7111
7110
  }
7112
7111
 
7113
- // ggml_compute_forward_pool_1d_sk_p0
7114
-
7115
- static void ggml_compute_forward_pool_1d_sk_p0(
7112
+ // ggml_compute_forward_pool_1d_ksp
7113
+ static void ggml_compute_forward_pool_1d_ksp(
7116
7114
  const ggml_compute_params * params,
7117
7115
  const ggml_op_pool op,
7118
7116
  const int k,
7117
+ const int s,
7118
+ const int p,
7119
7119
  ggml_tensor * dst) {
7120
7120
 
7121
7121
  const ggml_tensor * src = dst->src[0];
@@ -7126,39 +7126,56 @@ static void ggml_compute_forward_pool_1d_sk_p0(
7126
7126
  return;
7127
7127
  }
7128
7128
 
7129
- const char * cdata = (const char *)src->data;
7130
- const char * const data_end = cdata + ggml_nbytes(src);
7131
- float * drow = (float *)dst->data;
7129
+ const int64_t IW = src->ne[0];
7130
+ const int64_t OW = dst->ne[0];
7132
7131
 
7133
- const int64_t rs = dst->ne[0];
7132
+ const int64_t nr = ggml_nrows(src);
7134
7133
 
7135
- while (cdata < data_end) {
7136
- const void * srow = (const void *)cdata;
7137
- int j = 0;
7138
- for (int64_t i = 0; i < rs; ++i) {
7134
+ for (int64_t ir = 0; ir < nr; ++ir) {
7135
+ const char * srow_bytes = (const char *) src->data + ir * src->nb[1];
7136
+ float * drow = (float *) (( char *) dst->data + ir * dst->nb[1]);
7137
+
7138
+ for (int64_t ow = 0; ow < OW; ++ow) {
7139
+ float res = 0;
7139
7140
  switch (op) {
7140
- case GGML_OP_POOL_AVG: drow[i] = 0; break;
7141
- case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break;
7141
+ case GGML_OP_POOL_AVG: res = 0.0f; break;
7142
+ case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
7142
7143
  case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
7143
7144
  }
7145
+
7146
+ int count = 0;
7147
+ const int base = (int) ow * s - p;
7148
+
7144
7149
  for (int ki = 0; ki < k; ++ki) {
7145
- const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
7150
+ const int j = base + ki;
7151
+ if (j < 0 || j >= (int) IW) {
7152
+ continue;
7153
+ }
7154
+
7155
+ float v;
7156
+ if (src->type == GGML_TYPE_F32) {
7157
+ v = ((const float *) srow_bytes)[j];
7158
+ } else {
7159
+ v = GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) srow_bytes)[j]);
7160
+ }
7161
+
7146
7162
  switch (op) {
7147
- case GGML_OP_POOL_AVG: drow[i] += srow_j; break;
7148
- case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break;
7149
- case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
7163
+ case GGML_OP_POOL_AVG: res += v; break;
7164
+ case GGML_OP_POOL_MAX: res = std::max(v, res); break;
7165
+ case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
7150
7166
  }
7151
- ++j;
7167
+
7168
+ ++count;
7152
7169
  }
7170
+
7153
7171
  switch (op) {
7154
- case GGML_OP_POOL_AVG: drow[i] /= k; break;
7155
- case GGML_OP_POOL_MAX: break;
7172
+ case GGML_OP_POOL_AVG: res = (count > 0) ? (res / count) : 0.0f; break;
7173
+ case GGML_OP_POOL_MAX: break;
7156
7174
  case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
7157
7175
  }
7158
- }
7159
7176
 
7160
- cdata += src->nb[1];
7161
- drow += rs;
7177
+ drow[ow] = res;
7178
+ }
7162
7179
  }
7163
7180
  }
7164
7181
 
@@ -7173,10 +7190,8 @@ void ggml_compute_forward_pool_1d(
7173
7190
  const int k0 = opts[1];
7174
7191
  const int s0 = opts[2];
7175
7192
  const int p0 = opts[3];
7176
- GGML_ASSERT(p0 == 0); // padding not supported
7177
- GGML_ASSERT(k0 == s0); // only s = k supported
7178
7193
 
7179
- ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst);
7194
+ ggml_compute_forward_pool_1d_ksp(params, op, k0, s0, p0, dst);
7180
7195
  }
7181
7196
 
7182
7197
  // ggml_compute_forward_pool_2d
@@ -7194,6 +7209,7 @@ void ggml_compute_forward_pool_2d(
7194
7209
  }
7195
7210
 
7196
7211
  const int32_t * opts = (const int32_t *)dst->op_params;
7212
+
7197
7213
  ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
7198
7214
  const int k0 = opts[1];
7199
7215
  const int k1 = opts[2];
@@ -7217,11 +7233,13 @@ void ggml_compute_forward_pool_2d(
7217
7233
  while (cdata < data_end) {
7218
7234
  for (int oy = 0; oy < py; ++oy) {
7219
7235
  float * const drow = dplane + oy * px;
7236
+ float * const out = drow;
7237
+
7220
7238
  for (int ox = 0; ox < px; ++ox) {
7221
- float * const out = drow + ox;
7239
+ float res = 0;
7222
7240
  switch (op) {
7223
- case GGML_OP_POOL_AVG: *out = 0; break;
7224
- case GGML_OP_POOL_MAX: *out = -FLT_MAX; break;
7241
+ case GGML_OP_POOL_AVG: res = 0; break;
7242
+ case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
7225
7243
  case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
7226
7244
  }
7227
7245
 
@@ -7229,24 +7247,32 @@ void ggml_compute_forward_pool_2d(
7229
7247
  const int iy = offset1 + oy * s1;
7230
7248
 
7231
7249
  for (int ky = 0; ky < k1; ++ky) {
7232
- if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
7250
+ if (iy + ky < 0 || iy + ky >= src->ne[1]) {
7251
+ continue;
7252
+ }
7253
+
7233
7254
  const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky));
7234
7255
  for (int kx = 0; kx < k0; ++kx) {
7235
7256
  int j = ix + kx;
7236
- if (j < 0 || j >= src->ne[0]) continue;
7257
+ if (j < 0 || j >= src->ne[0]) {
7258
+ continue;
7259
+ }
7260
+
7237
7261
  const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
7238
7262
  switch (op) {
7239
- case GGML_OP_POOL_AVG: *out += srow_j; break;
7240
- case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break;
7263
+ case GGML_OP_POOL_AVG: res += srow_j; break;
7264
+ case GGML_OP_POOL_MAX: res = std::max(srow_j, res); break;
7241
7265
  case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
7242
7266
  }
7243
7267
  }
7244
7268
  }
7245
7269
  switch (op) {
7246
- case GGML_OP_POOL_AVG: *out /= ka; break;
7247
- case GGML_OP_POOL_MAX: break;
7270
+ case GGML_OP_POOL_AVG: res /= ka; break;
7271
+ case GGML_OP_POOL_MAX: break;
7248
7272
  case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
7249
7273
  }
7274
+
7275
+ out[ox] = res;
7250
7276
  }
7251
7277
  }
7252
7278
 
@@ -654,6 +654,14 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
654
654
  vec_extract(x[0], 2) + \
655
655
  vec_extract(x[0], 3); \
656
656
  }
657
+ #define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \
658
+ { \
659
+ vector float v = vec_add(vec_add(s0, s1), \
660
+ vec_add(s2, s3)); \
661
+ v = vec_add(v, vec_sld(v, v, 8)); \
662
+ v = vec_add(v, vec_sld(v, v, 4)); \
663
+ res += (ggml_float) vec_extract(v, 0); \
664
+ }
657
665
 
658
666
  #define GGML_F32_VEC GGML_F32x4
659
667
  #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
@@ -690,6 +698,29 @@ static inline unsigned char ggml_endian_byte(int i) {
690
698
  r[i - GGML_ENDIAN_BYTE(0)]), \
691
699
  0, p - GGML_F16_EPR)
692
700
 
701
+ //BF16 POWER9
702
+ #define GGML_BF16_STEP 16
703
+ #define GGML_BF16_EPR 8
704
+
705
+ #define GGML_BF16x8 vector unsigned short
706
+ #define GGML_BF16x8_ZERO vec_splats((unsigned short)0)
707
+ #define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))
708
+
709
+ #define GGML_BF16_VEC GGML_BF16x8
710
+ #define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO
711
+ #define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD
712
+ #if defined(__LITTLE_ENDIAN__)
713
+ #define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel(GGML_BF16_VEC_ZERO, (v)))
714
+ #define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh(GGML_BF16_VEC_ZERO, (v)))
715
+ #else
716
+ #define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel((v), GGML_BF16_VEC_ZERO))
717
+ #define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh((v), GGML_BF16_VEC_ZERO))
718
+ #endif
719
+ #define GGML_BF16_FMA_LO(acc, x, y) \
720
+ (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
721
+ #define GGML_BF16_FMA_HI(acc, x, y) \
722
+ (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))
723
+
693
724
  #elif defined(__wasm_simd128__)
694
725
 
695
726
  #define GGML_SIMD
@@ -237,6 +237,24 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
237
237
  sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
238
238
 
239
239
  #endif
240
+ #if defined(__POWER9_VECTOR__)
241
+ const int np = (n & ~(GGML_BF16_STEP - 1));
242
+ if (np > 0) {
243
+ GGML_F32_VEC sum[4] = {GGML_F32_VEC_ZERO};
244
+ for (; i < np; i += GGML_BF16_STEP) {
245
+ GGML_BF16_VEC vx0 = GGML_BF16_VEC_LOAD(x + i);
246
+ GGML_BF16_VEC vx1 = GGML_BF16_VEC_LOAD(x + i + 8);
247
+ GGML_BF16_VEC vy0 = GGML_BF16_VEC_LOAD(y + i);
248
+ GGML_BF16_VEC vy1 = GGML_BF16_VEC_LOAD(y + i + 8);
249
+ GGML_BF16_FMA_LO(sum[0], vx0, vy0);
250
+ GGML_BF16_FMA_HI(sum[1], vx0, vy0);
251
+ GGML_BF16_FMA_LO(sum[2], vx1, vy1);
252
+ GGML_BF16_FMA_HI(sum[3], vx1, vy1);
253
+ }
254
+ GGML_F32x4_REDUCE_4(sumf, sum[0], sum[1], sum[2], sum[3]);
255
+ }
256
+ #endif
257
+
240
258
  for (; i < n; ++i) {
241
259
  sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
242
260
  GGML_BF16_TO_FP32(y[i]));
@@ -21,7 +21,9 @@ struct llama_sampler_deleter {
21
21
  };
22
22
 
23
23
  struct llama_adapter_lora_deleter {
24
- void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
24
+ void operator()(llama_adapter_lora *) {
25
+ // llama_adapter_lora_free is deprecated
26
+ }
25
27
  };
26
28
 
27
29
  typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
@@ -646,7 +646,8 @@ extern "C" {
646
646
 
647
647
  // Manually free a LoRA adapter
648
648
  // NOTE: loaded adapters will be free when the associated model is deleted
649
- LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
649
+ LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
650
+ "adapters are now freed together with the associated model");
650
651
 
651
652
  // Get the invocation tokens if the current lora is an alora
652
653
  LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
@@ -1255,7 +1256,6 @@ extern "C" {
1255
1256
  // [EXPERIMENTAL]
1256
1257
  // attach a sampler to the context
1257
1258
  // note: prefer initializing the context with llama_context_params.samplers when possible
1258
- // note: changing the samplers of a context can cause graph reallocations and degraded performance
1259
1259
  LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
1260
1260
 
1261
1261
  // mirror of llama_sampler_i:
@@ -1395,6 +1395,33 @@ extern "C" {
1395
1395
  const char ** seq_breakers,
1396
1396
  size_t num_breakers);
1397
1397
 
1398
+ /// adaptive-p: select tokens near a configurable target probability over time.
1399
+ ///
1400
+ /// the adaptive-p sampler transforms the token probability distribution to favor tokens
1401
+ /// that fall near a user-configurable probability target.
1402
+ ///
1403
+ /// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
1404
+ /// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
1405
+ /// adapted target probability at each sampling step, thus maintaining the desired target
1406
+ /// probability over time.
1407
+ ///
1408
+ /// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
1409
+ /// in the sampler chain (like mirostat, dist, greedy).
1410
+ ///
1411
+ /// only mild truncation before this sampler is recommended. we suggest applying min-p
1412
+ /// before adaptive-p as the only other active sampler in the chain.
1413
+ ///
1414
+ /// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
1415
+ /// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
1416
+ /// @param seed RNG seed
1417
+ ///
1418
+ /// ref: https://github.com/ggml-org/llama.cpp/pull/17927
1419
+ ///
1420
+ LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
1421
+ float target,
1422
+ float decay,
1423
+ uint32_t seed);
1424
+
1398
1425
  LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
1399
1426
  int32_t n_vocab,
1400
1427
  int32_t n_logit_bias,
@@ -146,11 +146,9 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
146
146
  return nullptr;
147
147
  }
148
148
 
149
- static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
149
+ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
150
150
  LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
151
151
 
152
- llama_model & model = adapter.model;
153
-
154
152
  ggml_context * ctx_init;
155
153
  gguf_init_params meta_gguf_params = {
156
154
  /* .no_alloc = */ true,
@@ -413,17 +411,17 @@ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_l
413
411
  }
414
412
  }
415
413
 
416
- // update number of nodes used
417
- model.n_lora_nodes += adapter.get_n_nodes();
414
+ // register adapter with model
415
+ model.loras.insert(&adapter);
418
416
 
419
417
  LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
420
418
  }
421
419
 
422
420
  llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
423
- llama_adapter_lora * adapter = new llama_adapter_lora(*model);
421
+ llama_adapter_lora * adapter = new llama_adapter_lora();
424
422
 
425
423
  try {
426
- llama_adapter_lora_init_impl(path_lora, *adapter);
424
+ llama_adapter_lora_init_impl(*model, path_lora, *adapter);
427
425
  return adapter;
428
426
  } catch (const std::exception & err) {
429
427
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -473,12 +471,8 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
473
471
  return snprintf(buf, buf_size, "%s", it->second.c_str());
474
472
  }
475
473
 
476
- void llama_adapter_lora_free(llama_adapter_lora * adapter) {
477
- // update number of nodes used
478
- GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
479
- adapter->model.n_lora_nodes -= adapter->get_n_nodes();
480
-
481
- delete adapter;
474
+ void llama_adapter_lora_free(llama_adapter_lora *) {
475
+ // deprecated: adapters are freed by llama_model's destructor
482
476
  }
483
477
 
484
478
  uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
@@ -59,8 +59,6 @@ struct llama_adapter_lora_weight {
59
59
  };
60
60
 
61
61
  struct llama_adapter_lora {
62
- llama_model & model;
63
-
64
62
  // map tensor name to lora_a_b
65
63
  std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
66
64
 
@@ -75,7 +73,7 @@ struct llama_adapter_lora {
75
73
  // activated lora (aLoRA)
76
74
  std::vector<llama_token> alora_invocation_tokens;
77
75
 
78
- llama_adapter_lora(llama_model & model) : model(model) {}
76
+ llama_adapter_lora() = default;
79
77
  ~llama_adapter_lora() = default;
80
78
 
81
79
  llama_adapter_lora_weight * get_weight(ggml_tensor * w);