@fugood/llama.node 0.3.12 → 0.3.14

This diff shows the changes between publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/src/llama-vocab.cpp
@@ -16,6 +16,7 @@
 #include <queue>
 #include <set>
 #include <unordered_map>
+#include <cctype>
 
 //
 // helpers
@@ -392,6 +393,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1592,6 +1600,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else if (
                 tokenizer_pre == "megrez") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else if (
+                tokenizer_pre == "gpt-4o") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
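
Together, the llama-vocab.cpp hunks above register a new gpt-4o pre-tokenizer: the tokenizer.ggml.pre metadata string selects LLAMA_VOCAB_PRE_TYPE_GPT4O, which in turn picks the rewritten split regex and disables space cleanup on detokenization. A minimal sketch of that dispatch, with hypothetical type and function names standing in for the real vocab internals:

    #include <stdexcept>
    #include <string>
    #include <vector>

    // Hypothetical mirror of the dispatch added in this diff: the GGUF
    // "tokenizer.ggml.pre" string picks an enum value, and the enum picks
    // the regex list used to pre-split text before the BPE merges run.
    enum pre_type { PRE_DEFAULT, PRE_GPT4O };

    struct pre_config {
        pre_type                 type;
        bool                     clean_spaces;   // gpt-4o turns this off
        std::vector<std::string> regex_exprs;
    };

    static pre_config select_pre_tokenizer(const std::string & tokenizer_pre) {
        if (tokenizer_pre == "gpt-4o") {
            return { PRE_GPT4O, /*clean_spaces=*/false,
                     { /* rewritten tokenizer.json regex from the hunk above */ } };
        }
        // the real code has many more branches and throws on unknown names
        throw std::runtime_error("unknown pre-tokenizer type: '" + tokenizer_pre + "'");
    }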
package/src/llama.cpp/src/llama.cpp
@@ -4978,6 +4978,149 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_gemma3() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
+        if (ubatch.token) {
+            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+            cb(inpL, "inp_scaled", -1);
+        }
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        // gemma3 requires different mask for layers using sliding window (SWA)
+        struct ggml_tensor * KQ_mask     = build_inp_KQ_mask(true);
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
+
+        // "5-to-1 interleaved attention"
+        // 5 layers of local attention followed by 1 layer of global attention
+        static const int sliding_window_pattern = 6;
+
+        for (int il = 0; il < n_layer; ++il) {
+            const bool is_sliding = (il + 1) % sliding_window_pattern;
+            const float freq_base_l  = is_sliding ? 10000.0f : freq_base;
+            const float freq_scale_l = is_sliding ? 1.0f     : freq_scale;
+            struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens);
+                Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                        model.layers[il].attn_q_norm,
+                        NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens);
+                Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                        model.layers[il].attn_k_norm,
+                        NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 
     struct ggml_cgraph * build_starcoder2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
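
The layer schedule in build_gemma3() hinges on (il + 1) % sliding_window_pattern: a non-zero remainder marks a sliding-window (local) layer with a fixed 10000.0f RoPE base, and every sixth layer falls through to global attention with the model's configured freq_base/freq_scale. A standalone sketch of the resulting schedule (0-based layer indices, as in the loop; the 12-layer depth is only an example):

    #include <cstdio>

    int main() {
        const int sliding_window_pattern = 6;  // same constant as build_gemma3()
        const int n_layer = 12;                // example depth; real models differ
        for (int il = 0; il < n_layer; ++il) {
            // non-zero remainder -> sliding-window (local) attention;
            // zero -> global attention (layers 5, 11, ...)
            const bool is_sliding = (il + 1) % sliding_window_pattern;
            std::printf("layer %2d: %s\n", il, is_sliding ? "local (SWA)" : "global");
        }
        return 0;
    }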
@@ -8298,6 +8441,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gemma2();
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                result = llm.build_gemma3();
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 result = llm.build_starcoder2();
@@ -8801,12 +8948,14 @@ static int llama_decode_impl(
     //llama_synchronize(&lctx);
 
     // decide if we need to defrag the kv cache
-    if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
-        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
+    if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
+        // - do not defrag small contexts (i.e. < 2048 tokens)
+        // - count the padding towards the number of used tokens
+        const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;
 
         // queue defragmentation for next llama_kv_cache_update
         if (fragmentation > cparams.defrag_thold) {
-            //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
+            LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
 
             llama_kv_cache_defrag(kv_self);
         }
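
The updated heuristic skips defragmentation for contexts under 2048 cells, counts the cache padding as used, and no longer fires when defrag_thold is exactly zero (the old guard was >= 0.0f). A hedged standalone sketch of the measure, with padding standing in for llama_kv_cache_get_padding(cparams):

    #include <algorithm>
    #include <cstdint>

    // Mirrors the updated heuristic: below 2048 cells defrag is never
    // requested; otherwise fragmentation is the share of cells that are
    // neither used nor padding, clamped at 0.
    static float kv_fragmentation(uint32_t n, uint32_t used, uint32_t padding) {
        if (n < 2048) {
            return 0.0f;
        }
        return std::max(0.0f, 1.0f - float(used + padding) / float(n));
    }

    // e.g. kv_fragmentation(4096, 3000, 32) is about 0.26, so a
    // defrag_thold of 0.1 would queue a defrag; with the `> 0.0f` guard,
    // a threshold of exactly 0 now disables the feature entirely.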
@@ -9428,8 +9577,6 @@ static struct llama_model * llama_model_load_from_file_impl(
         struct llama_model_params params) {
     ggml_time_init();
 
-    llama_model * model = new llama_model(params);
-
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -9447,6 +9594,8 @@ static struct llama_model * llama_model_load_from_file_impl(
         };
     }
 
+    llama_model * model = new llama_model(params);
+
     // create list of devices to use with this model
     if (params.devices) {
         for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
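
This hunk and the previous one move the new llama_model(params) allocation below the block that installs a default progress callback, presumably so the defaults are already in params when the model captures them at construction. A minimal sketch of the hazard, using hypothetical stand-in types rather than the real structs:

    #include <cassert>

    // Hypothetical stand-ins for the real llama structs.
    typedef void (*progress_cb)(float, void *);
    struct params_t { progress_cb progress_callback = nullptr; };
    struct model_t  { params_t params; model_t(const params_t & p) : params(p) {} };

    static void default_cb(float, void *) {}

    int main() {
        params_t params;
        // old order: construct first, patch params afterwards -> the copy
        // held by the model still has the null callback
        model_t early(params);
        if (params.progress_callback == nullptr) params.progress_callback = default_cb;
        model_t late(params);  // new order: the default is visible to the model
        assert(early.params.progress_callback == nullptr);
        assert(late.params.progress_callback  == default_cb);
        return 0;
    }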
package/src/llama.cpp/src/unicode.cpp
@@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
-        result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        try {
+            result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        }
+        catch (const std::invalid_argument & /*ex*/) {
+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+            ++offset;
+            result.emplace_back(0xFFFD); // replacement character
+        }
     }
     return result;
 }
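
The change above replaces throw-on-invalid decoding with the conventional U+FFFD substitution: the offending byte is skipped and a replacement character is emitted, so invalid UTF-8 no longer propagates an exception out of llama_tokenize. A self-contained sketch of the same pattern, with a toy decode_one standing in for unicode_cpt_from_utf8:

    #include <cstdint>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Toy stand-in for unicode_cpt_from_utf8: decodes one ASCII byte and
    // advances offset on success, throwing on anything else. The real
    // decoder handles full multi-byte sequences the same way.
    static uint32_t decode_one(const std::string & s, size_t & offset) {
        const unsigned char c = s[offset];
        if (c < 0x80) { ++offset; return c; }
        throw std::invalid_argument("invalid UTF-8");
    }

    static std::vector<uint32_t> cpts_with_fallback(const std::string & utf8) {
        std::vector<uint32_t> result;
        result.reserve(utf8.size());
        size_t offset = 0;
        while (offset < utf8.size()) {
            try {
                result.push_back(decode_one(utf8, offset));
            } catch (const std::invalid_argument &) {
                ++offset;                  // skip the offending byte...
                result.push_back(0xFFFD);  // ...and emit the replacement character
            }
        }
        return result;
    }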
@@ -701,7 +708,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
     const auto cpts = unicode_cpts_from_utf8(text);
 
     // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
     std::string text_collapsed;
     if (need_collapse) {
         // collapse all unicode categories