@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/src/llama-model.cpp
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstring>
+#include <cmath>
 #include <functional>
 #include <map>
 #include <sstream>
@@ -864,6 +865,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_1B; break;
+                    case 34: type = LLM_TYPE_4B; break;
+                    case 48: type = LLM_TYPE_12B; break;
+                    case 62: type = LLM_TYPE_27B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                hparams.f_attention_scale = type == LLM_TYPE_27B
+                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
+                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
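Note on the hunk above: f_attention_scale is the Gemma 3 query scaling factor; only the 27B variant derives it from n_embd / n_head(0), the other sizes use n_embd_head_k. A hedged arithmetic sketch of the two branches (the 27B shape used here, n_embd = 5376, n_head = 32, n_embd_head_k = 128, is an assumed example taken from the published Gemma 3 config, not something this diff states):

    // Illustrative only -- the shape values are assumptions, not read from the diff.
    #include <cmath>
    #include <cstdio>

    int main() {
        const float scale_27b    = 1.0f / std::sqrt(float(5376 / 32)); // 1/sqrt(168) ~= 0.0772
        const float scale_others = 1.0f / std::sqrt(float(128));       // 1/sqrt(128) ~= 0.0884
        std::printf("27B: %.4f  others: %.4f\n", scale_27b, scale_others);
        return 0;
    }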
@@ -1275,7 +1293,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     const bool use_mmap_buffer = true;
 
-    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, use_mmap_buffer ? "true" : "false");
+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
     pimpl->cpu_buft_list = make_cpu_buft_list(devices);
@@ -1424,6 +1442,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
         }
 
+        // skip unused tensors
+        if (info.op == GGML_OP_NONE) {
+            LLAMA_LOG_WARN("model has unused tensor %s -- ignoring\n", tn.str().c_str());
+            ml.n_created++;
+
+            return nullptr;
+        }
+
         // tensors with "bias" suffix are always used with GGML_OP_ADD
         ggml_op op;
         bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
@@ -2194,13 +2220,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_PHI3:
                 {
-                    const int64_t n_embd_head = n_embd / n_head;
-
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
 
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -2215,8 +2244,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
 
-                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }
                 } break;
             case LLM_ARCH_PHIMOE:
@@ -2443,6 +2472,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                         layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
 
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_GEMMA3:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -3639,6 +3697,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
         LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
         LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
+        LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
         LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
         LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
         LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
@@ -3830,6 +3889,10 @@ int32_t llama_model_n_head(const struct llama_model * model) {
     return model->hparams.n_head();
 }
 
+int32_t llama_model_n_head_kv(const struct llama_model * model) {
+    return model->hparams.n_head_kv();
+}
+
 // deprecated
 int32_t llama_n_ctx_train(const struct llama_model * model) {
     return llama_model_n_ctx_train(model);
@@ -3908,6 +3971,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHIMOE:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
+        case LLM_ARCH_GEMMA3:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
package/src/llama.cpp/src/llama-sampling.cpp
@@ -316,6 +316,13 @@ static uint32_t get_rng_seed(uint32_t seed) {
 
 // llama_sampler API
 
+struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) {
+    return new llama_sampler {
+        /* .iface = */ iface,
+        /* .ctx = */ ctx,
+    };
+}
+
 const char * llama_sampler_name(const struct llama_sampler * smpl) {
     if (!smpl->iface) {
         return "(null)";
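The llama_sampler_init helper introduced above is what the constructors below are refactored onto, and it also gives application code a single entry point for building custom samplers. A minimal sketch of a no-op sampler, assuming the llama_sampler_i field layout visible later in this diff (.name/.accept/.apply/.reset/.clone/.free) and that llama_sampler_init is exported through include/llama.h (listed as changed in this release):

    // Hedged sketch of a user-defined sampler built on llama_sampler_init().
    // Verify the exact public declarations against llama.h before relying on this.
    #include "llama.h"

    static const char * noop_name(const struct llama_sampler * /*smpl*/) {
        return "noop";
    }

    static void noop_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * /*cur_p*/) {
        // intentionally leaves the candidate list untouched
    }

    static struct llama_sampler_i noop_i = {
        /* .name = */ noop_name,
        /* .accept = */ nullptr,
        /* .apply = */ noop_apply,
        /* .reset = */ nullptr,
        /* .clone = */ nullptr,
        /* .free = */ nullptr,
    };

    struct llama_sampler * noop_sampler_init() {
        return llama_sampler_init(&noop_i, /* ctx = */ nullptr);
    }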
@@ -347,10 +354,10 @@ struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
     }
 
     if (smpl->ctx == nullptr) {
-        return new llama_sampler {
+        return llama_sampler_init(
             /* .iface = */ smpl->iface,
-            /* .ctx = */ nullptr,
-        };
+            /* .ctx = */ nullptr
+        );
     }
 
     GGML_ABORT("the sampler does not support cloning");
@@ -472,15 +479,15 @@ static struct llama_sampler_i llama_sampler_chain_i = {
 };
 
 struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_chain_i,
         /* .ctx = */ new llama_sampler_chain {
             /* .params = */ params,
             /* .samplers = */ {},
             /* .t_sample_us = */ 0,
             /* .n_sample = */ 0,
-        },
-    };
+        }
+    );
 }
 
 void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
@@ -546,10 +553,10 @@ static struct llama_sampler_i llama_sampler_greedy_i = {
 };
 
 struct llama_sampler * llama_sampler_init_greedy() {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_greedy_i,
-        /* .ctx = */ nullptr,
-    };
+        /* .ctx = */ nullptr
+    );
 }
 
 // dist
@@ -608,14 +615,14 @@ static struct llama_sampler_i llama_sampler_dist_i = {
 
 struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_dist_i,
         /* .ctx = */ new llama_sampler_dist {
             /* .seed = */ seed,
             /* .seed_cur = */ seed_cur,
             /* .rng = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }
 
 // softmax
@@ -638,10 +645,10 @@ static struct llama_sampler_i llama_sampler_softmax_i = {
 };
 
 struct llama_sampler * llama_sampler_init_softmax() {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_softmax_i,
-        /* .ctx = */ nullptr,
-    };
+        /* .ctx = */ nullptr
+    );
 }
 
 // top-k
@@ -678,12 +685,12 @@ static struct llama_sampler_i llama_sampler_top_k_i = {
 };
 
 struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_top_k_i,
        /* .ctx = */ new llama_sampler_top_k {
            /* .k = */ k,
-        },
-    };
+        }
+    );
 }
 
 // top-p
@@ -744,13 +751,13 @@ static struct llama_sampler_i llama_sampler_top_p_i = {
 };
 
 struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_p_i,
         /* .ctx = */ new llama_sampler_top_p {
             /* .p = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }
 
 // min-p
@@ -840,13 +847,13 @@ static struct llama_sampler_i llama_sampler_min_p_i = {
 };
 
 struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_min_p_i,
         /* .ctx = */ new llama_sampler_min_p {
             /* .p = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }
 
 // typical
@@ -939,13 +946,13 @@ static struct llama_sampler_i llama_sampler_typical_i = {
 };
 
 struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_typical_i,
         /* .ctx = */ new llama_sampler_typical {
             /* .p = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }
 
 // temp
@@ -983,12 +990,12 @@ static struct llama_sampler_i llama_sampler_temp_i = {
 };
 
 struct llama_sampler * llama_sampler_init_temp(float temp) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_i,
         /* .ctx = */ new llama_sampler_temp {
             /*.temp = */ temp,
-        },
-    };
+        }
+    );
 }
 
 // temp-ext
@@ -1093,14 +1100,14 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
 };
 
 struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_ext_i,
         /* .ctx = */ new llama_sampler_temp_ext {
             /* .temp = */ temp,
             /* .delta = */ delta,
             /* .exponent = */ exponent,
-        },
-    };
+        }
+    );
 }
 
 // xtc
  // xtc
@@ -1185,7 +1192,7 @@ static struct llama_sampler_i llama_sampler_xtc_i = {
1185
1192
 
1186
1193
  struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
1187
1194
  auto seed_cur = get_rng_seed(seed);
1188
- return new llama_sampler {
1195
+ return llama_sampler_init(
1189
1196
  /* .iface = */ &llama_sampler_xtc_i,
1190
1197
  /* .ctx = */ new llama_sampler_xtc {
1191
1198
  /* .probability = */ p,
@@ -1194,8 +1201,8 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
1194
1201
  /* .seed = */ seed,
1195
1202
  /* .seed_cur = */ seed_cur,
1196
1203
  /* .rng = */ std::mt19937(seed_cur),
1197
- },
1198
- };
1204
+ }
1205
+ );
1199
1206
  }
1200
1207
 
1201
1208
  // mirostat
@@ -1292,7 +1299,7 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
1292
1299
 
1293
1300
  struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
1294
1301
  auto seed_cur = get_rng_seed(seed);
1295
- return new llama_sampler {
1302
+ return llama_sampler_init(
1296
1303
  /* .iface = */ &llama_sampler_mirostat_i,
1297
1304
  /* .ctx = */ new llama_sampler_mirostat {
1298
1305
  /* .n_vocab = */ n_vocab,
@@ -1303,8 +1310,8 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
1303
1310
  /* .m = */ m,
1304
1311
  /* .mu = */ 2.0f*tau,
1305
1312
  /* .rng = */ std::mt19937(seed_cur),
1306
- },
1307
- };
1313
+ }
1314
+ );
1308
1315
  }
1309
1316
 
1310
1317
  // mirostat v2
@@ -1391,7 +1398,7 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
1391
1398
 
1392
1399
  struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
1393
1400
  auto seed_cur = get_rng_seed(seed);
1394
- return new llama_sampler {
1401
+ return llama_sampler_init(
1395
1402
  /* .iface = */ &llama_sampler_mirostat_v2_i,
1396
1403
  /* .ctx = */ new llama_sampler_mirostat_v2 {
1397
1404
  /* .seed = */ seed,
@@ -1400,8 +1407,8 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
1400
1407
  /* .eta = */ eta,
1401
1408
  /* .mu = */ 2.0f*tau,
1402
1409
  /* .rng = */ std::mt19937(seed_cur),
1403
- },
1404
- };
1410
+ }
1411
+ );
1405
1412
  }
1406
1413
 
1407
1414
  // grammar
@@ -1442,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         const char ** trigger_words,
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
-        size_t num_trigger_tokens);
+        size_t num_trigger_tokens,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns);
 
 static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_grammar *) smpl->ctx;
@@ -1450,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
         return;
     }
 
-    std::vector<const char *> trigger_words;
-    for (auto & word : ctx->grammar->trigger_words) {
-        trigger_words.push_back(word.c_str());
+    std::vector<const char *> trigger_patterns_c;
+    trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
+    for (auto & trigger_pattern : ctx->grammar->trigger_patterns) {
+        trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
     }
+
     auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
-            ctx->grammar->lazy, trigger_words.data(), trigger_words.size(),
+            ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
             ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
 
     llama_grammar_free_impl(ctx->grammar);
@@ -1465,7 +1476,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
 
-    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
 
     // copy the state
     {
@@ -1509,15 +1520,33 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         const char ** trigger_words,
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
-        size_t num_trigger_tokens) {
+        size_t num_trigger_tokens,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns) {
     auto * ctx = new llama_sampler_grammar;
 
     if (grammar_str != nullptr && grammar_str[0] != '\0') {
+        // TODO: remove trigger_words support.
+        if (trigger_words != nullptr && num_trigger_words > 0) {
+            GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
+            std::string trigger_pattern("[\\s\\S]*?(");
+            for (size_t i = 0; i < num_trigger_words; ++i) {
+                static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+                if (i > 0) {
+                    trigger_pattern += "|";
+                }
+                trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
+            }
+            trigger_pattern += ")[\\s\\S]*";
+            auto trigger_pattern_c = trigger_pattern.c_str();
+            trigger_patterns = &trigger_pattern_c;
+            num_trigger_patterns = 1;
+        }
         *ctx = {
             /* .vocab = */ vocab,
             /* .grammar_str = */ grammar_str,
             /* .grammar_root = */ grammar_root,
-            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
+            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
         };
     } else {
         *ctx = {
@@ -1528,17 +1557,17 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         };
     }
 
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_grammar_i,
-        /* .ctx = */ ctx,
-    };
+        /* .ctx = */ ctx
+    );
 }
 
 struct llama_sampler * llama_sampler_init_grammar(
         const struct llama_vocab * vocab,
         const char * grammar_str,
         const char * grammar_root) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
 }
 
 struct llama_sampler * llama_sampler_init_grammar_lazy(
@@ -1549,7 +1578,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy(
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
         size_t num_trigger_tokens) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns);
 }
 
 // penalties
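llama_sampler_init_grammar_lazy_patterns above supersedes the word-based trigger API: the old trigger_words path is now rewritten into a single regex internally (see the init_grammar_impl hunk). A hedged usage sketch; the grammar, root rule and trigger pattern are made-up examples, and the declaration is assumed to be exported via include/llama.h (listed as changed in this release):

    #include "llama.h"

    // Hedged sketch: a lazily-triggered grammar sampler that only constrains
    // output once it matches the trigger pattern. Grammar and pattern are
    // illustrative, not taken from this package.
    struct llama_sampler * make_lazy_json_sampler(const struct llama_vocab * vocab) {
        const char * grammar_str = "root ::= \"{\" [^}]* \"}\"";
        const char * patterns[]  = { "[\\s\\S]*?(<tool_call>)[\\s\\S]*" };

        return llama_sampler_init_grammar_lazy_patterns(
            vocab, grammar_str, "root",
            patterns, 1,   // trigger patterns + count
            nullptr, 0);   // trigger tokens + count
    }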
@@ -1678,7 +1718,7 @@ struct llama_sampler * llama_sampler_init_penalties(
         float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);
 
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx = */ new llama_sampler_penalties {
             /* .penalty_last_n = */ penalty_last_n,
@@ -1687,8 +1727,75 @@ struct llama_sampler * llama_sampler_init_penalties(
             /* .penalty_present = */ penalty_present,
             /* .prev = */ ring_buffer<llama_token>(penalty_last_n),
             /* .token_count = */ {},
-        },
-    };
+        }
+    );
+}
+
+// top-n-sigma
+
+struct llama_sampler_top_n_sigma {
+    const float n;
+};
+
+static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) {
+    return "top-n-sigma";
+}
+
+static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
+
+    // find max logit and calculate mean
+    float max = cur_p->data[0].logit;
+    float logits_sum = 0;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (cur_p->data[i].logit > max) {
+            max = cur_p->data[i].logit;
+        }
+        logits_sum += cur_p->data[i].logit;
+    }
+    float mean = logits_sum/cur_p->size;
+
+    // calculate standard deviation
+    float acc = 0;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        acc += pow(cur_p->data[i].logit - mean, 2);
+    }
+    float std = sqrt(acc/cur_p->size);
+
+    //apply mask
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (cur_p->data[i].logit < max - (ctx->n * std)) {
+            cur_p->data[i].logit = -INFINITY;
+        }
+    }
+    llama_sampler_softmax_impl(cur_p);
+}
+
+static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_top_n_sigma *) smpl->ctx;
+    return llama_sampler_init_top_n_sigma(ctx->n);
+}
+
+static void llama_sampler_top_n_sigma_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_top_n_sigma *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_top_n_sigma_i = {
+    /* .name = */ llama_sampler_top_n_sigma_name,
+    /* .accept = */ nullptr,
+    /* .apply = */ llama_sampler_top_n_sigma_apply,
+    /* .reset = */ nullptr,
+    /* .clone = */ llama_sampler_top_n_sigma_clone,
+    /* .free = */ llama_sampler_top_n_sigma_free,
+};
+
+struct llama_sampler * llama_sampler_init_top_n_sigma(float n) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_top_n_sigma_i,
+        /* .ctx = */ new llama_sampler_top_n_sigma {
+            /* .n = */ n,
+        }
+    );
 }
 
 // DRY
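The top-n-sigma sampler added above masks every candidate whose logit falls more than n standard deviations below the maximum logit (those logits are set to -INFINITY), then re-normalizes with softmax. A hedged sketch of wiring it into a sampler chain, assuming llama_sampler_init_top_n_sigma is also declared in include/llama.h in this release (the header is listed as changed above):

    #include "llama.h"

    // Hedged sketch: top-n-sigma followed by seeded sampling from the
    // surviving candidates. n = 2.0f keeps tokens whose logit is within two
    // standard deviations of the best logit.
    struct llama_sampler * make_top_n_sigma_chain(uint32_t seed) {
        struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(2.0f));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));
        return chain;
    }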
@@ -2041,7 +2148,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
         }
     }
 
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_dry_i,
         /* .ctx = */ new llama_sampler_dry {
             /* .total_context_size = */ context_size,
@@ -2053,8 +2160,8 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
             /* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
             /* .dry_max_token_repeat = */ {},
             /* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
-        },
-    };
+        }
+    );
 }
 
 // wrapper for test-sampling.cpp
@@ -2155,14 +2262,14 @@ struct llama_sampler * llama_sampler_init_logit_bias(
         int32_t n_vocab,
         int32_t n_logit_bias,
         const llama_logit_bias * logit_bias) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_logit_bias_i,
         /* .ctx = */ new llama_sampler_logit_bias {
             /* .n_vocab = */ n_vocab,
             /* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
             /* .to_search = */ {},
-        },
-    };
+        }
+    );
 }
 
 // infill
@@ -2377,14 +2484,14 @@ static struct llama_sampler_i llama_sampler_infill_i = {
 };
 
 struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_infill_i,
         /* .ctx = */ new llama_sampler_infill {
             /* .vocab = */ vocab,
             /* .buf0 = */ std::vector<char>(512),
            /* .buf1 = */ std::vector<char>(512),
-        },
-    };
+        }
+    );
 }
 
 // utils