@fugood/llama.node 0.3.13 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstring>
+#include <cmath>
 #include <functional>
 #include <map>
 #include <sstream>
@@ -864,6 +865,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_1B; break;
+                    case 34: type = LLM_TYPE_4B; break;
+                    case 48: type = LLM_TYPE_12B; break;
+                    case 62: type = LLM_TYPE_27B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                hparams.f_attention_scale = type == LLM_TYPE_27B
+                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
+                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
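
Note: the new LLM_ARCH_GEMMA3 case above reads the sliding-window size and RMS-norm epsilon, maps the layer count to a model size, and then picks the attention scale — only the 27B variant divides by sqrt(n_embd / n_head) instead of sqrt(n_embd_head_k). A minimal standalone sketch of that selection follows; the dimensions are hypothetical placeholders, not values read from any GGUF.

// Illustrative sketch (not part of the diff): mirrors the scale selection above.
#include <cmath>
#include <cstdio>

int main() {
    const int  n_embd        = 5376;  // hypothetical embedding width
    const int  n_head        = 32;    // hypothetical attention head count
    const int  n_embd_head_k = 128;   // hypothetical per-head key dimension
    const bool is_27b        = true;  // pretend the layer count mapped to LLM_TYPE_27B

    const float scale = is_27b
        ? 1.0f / std::sqrt(float(n_embd / n_head))   // 27B: scale by sqrt(n_embd / n_head)
        : 1.0f / std::sqrt(float(n_embd_head_k));    // other sizes: scale by sqrt(head dim)

    std::printf("f_attention_scale = %f\n", scale);
    return 0;
}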
@@ -1424,6 +1442,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
         }

+        // skip unused tensors
+        if (info.op == GGML_OP_NONE) {
+            LLAMA_LOG_WARN("model has unused tensor %s -- ignoring\n", tn.str().c_str());
+            ml.n_created++;
+
+            return nullptr;
+        }
+
         // tensors with "bias" suffix are always used with GGML_OP_ADD
         ggml_op op;
         bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
@@ -2194,13 +2220,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_PHI3:
             {
-                const int64_t n_embd_head = n_embd / n_head;
-
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

                 // output
                 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }

                 for (int i = 0; i < n_layer; ++i) {
                     auto & layer = layers[i];
@@ -2215,8 +2244,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
                     layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);

-                    layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                    layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                    layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                    layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                 }
             } break;
         case LLM_ARCH_PHIMOE:
@@ -2443,6 +2472,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                     layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                    layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                }
+            } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                     layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                     layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -3639,6 +3697,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
         LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
         LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
+        LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
         LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
         LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
         LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
@@ -3830,6 +3889,10 @@ int32_t llama_model_n_head(const struct llama_model * model) {
     return model->hparams.n_head();
 }

+int32_t llama_model_n_head_kv(const struct llama_model * model) {
+    return model->hparams.n_head_kv();
+}
+
 // deprecated
 int32_t llama_n_ctx_train(const struct llama_model * model) {
     return llama_model_n_ctx_train(model);
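
Note: llama_model_n_head_kv is a new public accessor (the include/llama.h changes in this diff presumably expose it alongside the existing llama_model_n_head). A hedged usage sketch, assuming a model handle obtained elsewhere:

// Illustrative sketch (not part of the diff): query head counts from a loaded model.
#include <cstdio>
#include "llama.h"

void print_head_counts(const struct llama_model * model) {
    std::printf("n_head    = %d\n", llama_model_n_head(model));
    std::printf("n_head_kv = %d\n", llama_model_n_head_kv(model));
}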
@@ -3908,6 +3971,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHIMOE:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
+        case LLM_ARCH_GEMMA3:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
@@ -1449,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         const char ** trigger_words,
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
-        size_t num_trigger_tokens);
+        size_t num_trigger_tokens,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns);

 static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_grammar *) smpl->ctx;
@@ -1457,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
         return;
     }

-    std::vector<const char *> trigger_words;
-    for (auto & word : ctx->grammar->trigger_words) {
-        trigger_words.push_back(word.c_str());
+    std::vector<const char *> trigger_patterns_c;
+    trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
+    for (auto & trigger_pattern : ctx->grammar->trigger_patterns) {
+        trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
     }
+
     auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
-            ctx->grammar->lazy, trigger_words.data(), trigger_words.size(),
+            ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
             ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());

     llama_grammar_free_impl(ctx->grammar);
@@ -1472,7 +1476,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;

-    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);

     // copy the state
     {
@@ -1516,15 +1520,33 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         const char ** trigger_words,
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
-        size_t num_trigger_tokens) {
+        size_t num_trigger_tokens,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns) {
     auto * ctx = new llama_sampler_grammar;

     if (grammar_str != nullptr && grammar_str[0] != '\0') {
+        // TODO: remove trigger_words support.
+        if (trigger_words != nullptr && num_trigger_words > 0) {
+            GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
+            std::string trigger_pattern("[\\s\\S]*?(");
+            for (size_t i = 0; i < num_trigger_words; ++i) {
+                static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+                if (i > 0) {
+                    trigger_pattern += "|";
+                }
+                trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
+            }
+            trigger_pattern += ")[\\s\\S]*";
+            auto trigger_pattern_c = trigger_pattern.c_str();
+            trigger_patterns = &trigger_pattern_c;
+            num_trigger_patterns = 1;
+        }
         *ctx = {
             /* .vocab = */ vocab,
             /* .grammar_str = */ grammar_str,
             /* .grammar_root = */ grammar_root,
-            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
+            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
         };
     } else {
         *ctx = {
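
Note: the added block above is a compatibility path — legacy trigger_words are regex-escaped and OR-ed into a single pattern of the form [\s\S]*?(word1|word2)[\s\S]* before being handed to llama_grammar_init_impl as a trigger pattern. A standalone sketch of the same transformation, mirroring the escaping done above (the example words are hypothetical):

// Illustrative sketch (not part of the diff): builds the same kind of trigger pattern
// as the trigger_words fallback above, for two hypothetical trigger words.
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    const std::vector<std::string> trigger_words = { "<tool_call>", "<|python_tag|>" }; // hypothetical
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");

    std::string trigger_pattern("[\\s\\S]*?(");
    for (size_t i = 0; i < trigger_words.size(); ++i) {
        if (i > 0) {
            trigger_pattern += "|";
        }
        // escape regex metacharacters in each word before OR-ing the words together
        trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
    }
    trigger_pattern += ")[\\s\\S]*";

    // expected output: [\s\S]*?(<tool_call>|<\|python_tag\|>)[\s\S]*
    std::printf("%s\n", trigger_pattern.c_str());
    return 0;
}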
@@ -1545,7 +1567,7 @@ struct llama_sampler * llama_sampler_init_grammar(
         const struct llama_vocab * vocab,
         const char * grammar_str,
         const char * grammar_root) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
 }

 struct llama_sampler * llama_sampler_init_grammar_lazy(
@@ -1556,7 +1578,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy(
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
         size_t num_trigger_tokens) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns);
 }

 // penalties
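
Note: llama_sampler_init_grammar_lazy keeps its old signature (now delegating with no patterns), while llama_sampler_init_grammar_lazy_patterns is the new entry point that takes trigger regexes directly. A hedged usage sketch; the grammar string and trigger pattern below are hypothetical placeholders:

// Illustrative sketch (not part of the diff): creating a lazy grammar sampler that only
// activates once the generated text matches a trigger pattern.
#include "llama.h"

static struct llama_sampler * make_lazy_tool_sampler(const struct llama_vocab * vocab) {
    const char * grammar_str  = "root ::= \"{\" [^}]* \"}\"";            // hypothetical grammar
    const char * grammar_root = "root";
    const char * patterns[]   = { "[\\s\\S]*?(<tool_call>)[\\s\\S]*" };  // hypothetical trigger

    return llama_sampler_init_grammar_lazy_patterns(
            vocab, grammar_str, grammar_root,
            patterns, 1,            // trigger_patterns, num_trigger_patterns
            /* trigger_tokens = */ nullptr, 0);
}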
@@ -16,6 +16,7 @@
 #include <queue>
 #include <set>
 #include <unordered_map>
+#include <cctype>

 //
 // helpers
@@ -392,6 +393,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1592,6 +1600,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         } else if (
             tokenizer_pre == "megrez") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+        } else if (
+            tokenizer_pre == "gpt-4o") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+            clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -4978,6 +4978,149 @@ struct llm_build_context {
         return gf;
     }

+    struct ggml_cgraph * build_gemma3() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+        if (ubatch.token) {
+            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+            cb(inpL, "inp_scaled", -1);
+        }
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        // gemma3 requires different mask for layers using sliding window (SWA)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true);
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
+
+        // "5-to-1 interleaved attention"
+        // 5 layers of local attention followed by 1 layer of global attention
+        static const int sliding_window_pattern = 6;
+
+        for (int il = 0; il < n_layer; ++il) {
+            const bool is_sliding = (il + 1) % sliding_window_pattern;
+            const float freq_base_l = is_sliding ? 10000.0f : freq_base;
+            const float freq_scale_l = is_sliding ? 1.0f : freq_scale;
+            struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens);
+                Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                        model.layers[il].attn_q_norm,
+                        NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens);
+                Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                        model.layers[il].attn_k_norm,
+                        NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }

     struct ggml_cgraph * build_starcoder2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
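
Note: in build_gemma3 above, is_sliding is non-zero for five of every six layers, so those layers take the SWA mask and a fixed 10000 RoPE base, while every sixth layer uses the global mask and the configured freq_base/freq_scale. A tiny standalone sketch of that layer schedule (the layer count below is hypothetical):

// Illustrative sketch (not part of the diff): which layers the "5-to-1 interleaved
// attention" pattern treats as local (sliding-window) vs. global.
#include <cstdio>

int main() {
    const int sliding_window_pattern = 6;
    const int n_layer = 12; // hypothetical layer count

    for (int il = 0; il < n_layer; ++il) {
        const bool is_sliding = (il + 1) % sliding_window_pattern; // non-zero => local/SWA
        std::printf("layer %2d: %s\n", il, is_sliding ? "local (SWA)" : "global");
    }
    return 0;
}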
@@ -8298,6 +8441,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gemma2();
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                result = llm.build_gemma3();
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 result = llm.build_starcoder2();