@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159):
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -55,6 +55,8 @@ const std::vector<std::string> type_names = {
     "q4_k",
     "q5_k",
     "q6_k",
+    "iq1_s",
+    "iq1_m",
     "iq2_xxs",
     "iq2_xs",
     "iq2_s",
@@ -182,6 +184,13 @@ std::string to_uppercase(const std::string& input) {
     return result;
 }
 
+bool string_starts_with(const std::string& str, const std::string& prefix) {
+    if (prefix.size() > str.size()) {
+        return false;
+    }
+    return std::equal(prefix.begin(), prefix.end(), str.begin());
+}
+
 bool string_ends_with(const std::string& str, const std::string& suffix) {
     if (suffix.size() > str.size()) {
         return false;
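
Note: the new `string_starts_with` mirrors the existing `string_ends_with` helper and is what routes the new `iq1_`/`iq2_`/`iq3_` quant types to their dedicated shaders in `process_shaders()` below. A minimal standalone sketch of its behavior (the test harness here is illustrative, not part of the package):

    #include <algorithm>
    #include <cassert>
    #include <string>

    static bool string_starts_with(const std::string& str, const std::string& prefix) {
        if (prefix.size() > str.size()) {
            return false;
        }
        return std::equal(prefix.begin(), prefix.end(), str.begin());
    }

    int main() {
        assert(string_starts_with("iq1_s", "iq1_"));  // routed to mul_mat_vec_iq1_s.comp
        assert(!string_starts_with("q4_k", "iq1_"));  // handled by the "_k" suffix check instead
        assert(!string_starts_with("iq", "iq1_"));    // prefix longer than string -> false
        return 0;
    }
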
@@ -316,11 +325,17 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool
     string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
 
     for (const auto& tname : type_names) {
+        std::string load_vec_quant = "2";
+        if ((tname == "q4_0") || (tname == "q4_1"))
+            load_vec_quant = "8";
+        else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq4_nl"))
+            load_vec_quant = "4";
+
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
         // For unaligned, load one at a time for f32/f16, or two at a time for quants
-        std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : "2";
+        std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : load_vec_quant;
         // For aligned matmul loads
-        std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2";
+        std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : load_vec_quant;
 
         // don't generate f32 variants for coopmat2
         if (!coopmat2) {
@@ -387,7 +402,7 @@ void process_shaders() {
     for (const auto& tname : type_names) {
         // mul mat vec
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
-        std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
+        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
 
         string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
         string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
@@ -418,6 +433,7 @@ void process_shaders() {
     string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
@@ -434,6 +450,8 @@ void process_shaders() {
     string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
 
+    string_to_spv("sub_f32", "sub.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
     string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
     string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
@@ -443,6 +461,7 @@ void process_shaders() {
     string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
     string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("repeat_back_f32", "repeat_back.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
@@ -465,14 +484,17 @@ void process_shaders() {
     string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
@@ -482,9 +504,19 @@ void process_shaders() {
     string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
 
+    string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("rope_multi_f16_rte", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+
+    string_to_spv("rope_vision_f32", "rope_vision.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("rope_vision_f16", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("rope_vision_f16_rte", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+
     string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
 
+    string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}}));
     string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
 
     string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
@@ -496,6 +528,8 @@ void process_shaders() {
 
     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
 
+    string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
+
     for (auto &c : compiles) {
         c.wait();
     }

package/src/llama.cpp/ggml/src/ggml.c
@@ -240,7 +240,11 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 
 
 void * ggml_aligned_malloc(size_t size) {
+#if defined(__s390x__)
+    const int alignment = 256;
+#else
     const int alignment = 64;
+#endif
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
     return _aligned_malloc(size, alignment);
@@ -561,9 +565,9 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
 #endif
 
 }
-static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
-static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
-static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
 static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
@@ -1379,7 +1383,7 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso
         (t0->nb[3] == t1->nb[3]);
 }
 
-// check if t1 can be represented as a repeatition of t0
+// check if t1 can be represented as a repetition of t0
 bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
@@ -2328,6 +2332,7 @@ struct ggml_tensor * ggml_concat(
         struct ggml_tensor * b,
         int dim) {
     GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+    GGML_ASSERT(a->type == b->type);
 
     int64_t ne[GGML_MAX_DIMS];
     for (int d = 0; d < GGML_MAX_DIMS; ++d) {
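
The new `GGML_ASSERT(a->type == b->type)` makes mixed-type concats fail loudly instead of silently misreading data. A hedged sketch of the caller-side fix, assuming the standard `ggml_cast` API; the function and tensor names are illustrative:

    #include "ggml.h"

    // Concatenate an F32 tensor with an F16 tensor along dim 0.
    // ggml_concat() now asserts matching types, so cast b up front.
    static struct ggml_tensor * concat_mixed(struct ggml_context * ctx,
                                             struct ggml_tensor  * a,    // GGML_TYPE_F32
                                             struct ggml_tensor  * b) {  // GGML_TYPE_F16
        struct ggml_tensor * b_f32 = ggml_cast(ctx, b, GGML_TYPE_F32);
        return ggml_concat(ctx, a, b_f32, 0);
    }
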

package/src/llama.cpp/include/llama.h
@@ -105,6 +105,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+        LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
     };
 
     enum llama_rope_type {
@@ -213,7 +214,7 @@ extern "C" {
         LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
     };
 
-    // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
+    // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
     typedef struct llama_token_data {
         llama_token id;    // token id
         float       logit; // log-odds of the token
@@ -307,7 +308,7 @@ extern "C" {
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
-    // https://github.com/ggerganov/llama.cpp/pull/7544
+    // https://github.com/ggml-org/llama.cpp/pull/7544
     struct llama_context_params {
         uint32_t n_ctx;   // text context, 0 = from model
         uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
@@ -320,7 +321,7 @@ extern "C" {
         enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type attention_type; // attention type to use for embeddings
 
-        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        // ref: https://github.com/ggml-org/llama.cpp/pull/2054
         float rope_freq_base;  // RoPE base frequency, 0 = from model
         float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
         float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
@@ -385,7 +386,7 @@ extern "C" {
     struct llama_adapter_lora;
 
     // Helpers for getting default parameters
-    // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
+    // TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172)
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
     LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
@@ -477,6 +478,7 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_embd    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer   (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head    (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
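
The new `llama_model_n_head_kv` accessor exposes the KV-head count alongside the existing metadata getters; comparing it with `llama_model_n_head` tells you whether a model uses regular multi-head attention (equal counts) or grouped/multi-query attention (fewer KV heads). A hedged sketch, assuming `model` was loaded elsewhere:

    #include "llama.h"
    #include <cstdio>

    static void print_attention_layout(const struct llama_model * model) {
        const int32_t n_head    = llama_model_n_head   (model);
        const int32_t n_head_kv = llama_model_n_head_kv(model); // new in this release
        printf("n_head = %d, n_head_kv = %d (%s)\n",
               n_head, n_head_kv,
               n_head == n_head_kv ? "MHA" : "GQA/MQA");
    }
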
@@ -1040,7 +1042,7 @@ extern "C" {
 
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
     /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
     /// @param chat Pointer to a list of multiple llama_chat_message
     /// @param n_msg Number of llama_chat_message in this chat
@@ -1114,11 +1116,12 @@ extern "C" {
     };
 
     struct llama_sampler {
-        struct llama_sampler_i * iface;
-        llama_sampler_context_t ctx;
+        const struct llama_sampler_i * iface;
+        llama_sampler_context_t        ctx;
     };
 
     // mirror of llama_sampler_i:
+    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
     LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
     LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
     LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
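
Making `iface` const pairs with the new `llama_sampler_init()` constructor: third-party code can now build a sampler from its own `llama_sampler_i` table instead of populating the struct by hand. A hedged sketch of a toy greedy sampler (this assumes unused hooks may be left null for a stateless sampler, matching how the built-in samplers behave):

    #include "llama.h"

    static const char * greedy_name(const struct llama_sampler * /*smpl*/) {
        return "toy-greedy";
    }

    static void greedy_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
        // Select the highest-logit candidate.
        size_t best = 0;
        for (size_t i = 1; i < cur_p->size; ++i) {
            if (cur_p->data[i].logit > cur_p->data[best].logit) {
                best = i;
            }
        }
        cur_p->selected = (int64_t) best;
    }

    static const struct llama_sampler_i greedy_iface = {
        /*.name   =*/ greedy_name,
        /*.accept =*/ nullptr,
        /*.apply  =*/ greedy_apply,
        /*.reset  =*/ nullptr,
        /*.clone  =*/ nullptr,
        /*.free   =*/ nullptr,
    };

    // Usage: struct llama_sampler * smpl = llama_sampler_init(&greedy_iface, /*ctx=*/nullptr);
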
@@ -1148,7 +1151,7 @@ extern "C" {
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
     DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
-        "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
+        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1156,7 +1159,7 @@ extern "C" {
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep);
 
-    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
     LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
@@ -1171,6 +1174,9 @@ extern "C" {
     /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
     LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
 
+    /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
+    LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
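
For context, `llama_sampler_init_top_n_sigma` slots into a sampler chain like the other truncation samplers; it keeps only tokens whose logits lie within n standard deviations of the maximum. A hedged sketch (the n = 1.0 and temperature values are illustrative, not recommendations):

    #include "llama.h"

    static struct llama_sampler * make_chain(void) {
        struct llama_sampler * chain =
            llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(1.0f)); // new in this release
        llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
        return chain;
    }
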
@@ -1199,17 +1205,29 @@ extern "C" {
             const char * grammar_str,
             const char * grammar_root);
 
-    /// @details Lazy grammar sampler, introduced in https://github.com/ggerganov/llama.cpp/pull/9639
-    /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
-    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
-    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
+    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
             const struct llama_vocab * vocab,
             const char * grammar_str,
             const char * grammar_root,
             const char ** trigger_words,
             size_t num_trigger_words,
             const llama_token * trigger_tokens,
-            size_t num_trigger_tokens);
+            size_t num_trigger_tokens),
+        "use llama_sampler_init_grammar_lazy_patterns instead");
+
+
+    /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+    /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
+    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
+    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+            const struct llama_vocab * vocab,
+            const char * grammar_str,
+            const char * grammar_root,
+            const char ** trigger_patterns,
+            size_t num_trigger_patterns,
+            const llama_token * trigger_tokens,
+            size_t num_trigger_tokens);
+
 
     /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
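
Migrating from the deprecated entry point mostly means turning each trigger word into a regex whose first capture group marks where grammar enforcement begins, per the doc comment above. A hedged sketch (the `<tool_call>` trigger and the exact pattern shape are illustrative assumptions, not taken from this diff):

    #include "llama.h"

    static struct llama_sampler * make_lazy_grammar(const struct llama_vocab * vocab,
                                                    const char * grammar_str) {
        // Before: const char * trigger_words[] = { "<tool_call>" };
        //         llama_sampler_init_grammar_lazy(vocab, grammar_str, "root",
        //                                         trigger_words, 1, NULL, 0);
        // After: match anything up to the trigger, capture from the trigger onward.
        const char * trigger_patterns[] = { "[\\s\\S]*?(<tool_call>[\\s\\S]*)" };
        return llama_sampler_init_grammar_lazy_patterns(
                vocab, grammar_str, "root",
                trigger_patterns, 1,
                /*trigger_tokens=*/NULL, 0);
    }
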

package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+Hello
+__ggml_vocab_test__
+(
+__ggml_vocab_test__
+
+=
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+discards
+__ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__

package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out
@@ -0,0 +1,46 @@
+ 1165 220 19 220 27124 5503
+ 37 19194 259
+
+ 220
+ 256
+ 271
+ 197
+ 198
+ 279
+ 2499
+ 2775
+ 13225 2375
+ 32949 2375
+ 13225 5922
+ 32949 5922
+ 32949 5922 0
+ 13225 11 2375 0
+ 32949 11 2375 0
+ 495 382 9552 99 247 13 17159
+ 86 45404 220 22 10191 2852 22924 4750 6916
+ 3907 53641 1235 185386 8118
+ 11400 107516 15867 20804 22851 134178 77431 32010 104312 37984 16329 27751 89335
+ 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 350 7393 74471 484 853 1617 2316 6602 8
+ 13225
+ 32949
+ 220 32949
+ 256 32949
+ 271 32949
+ 271 32949 198 271 32949
+ 350
+ 198 314
+ 6 6837
+ 13225 11 342 70653 0 3253 553 481 22861 223 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208
+ 147475
+ 18
+ 2546
+ 15517
+ 15517 18
+ 15517 2546
+ 15517 15517
+ 15517 15517 18
+ 15517 15517 2546
+ 15517 15517 15517
+ 34 60213 53904
+ 2960 3098
+ 126470 25980 160432 16609 2775 4066 172261 19432 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 9552 99 247 4103 99 247 220 18 220 2546 220 15517 220 15517 18 220 15517 2546 220 15517 15517 220 15517 15517 18 220 15517 15517 2546 220 18 13 18 220 18 485 18 220 18 1008 18 44735 107516 15867 20804 22851 134178 77431 32010 104312 156437 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208 105024 106657 1967 53641 1235 185386 8118 22434 39336 26178 26178 168394 194663 27271 147475 25883 6961 9790 1339 461 83 1280 19016 1354 11 461 1099 481 3239 30 461 44 625 3239 17291 1520 480 11 461 35 481 1299 1236 17966 30 1416 6 27493 261 54602 43

package/src/llama.cpp/requirements/requirements-all.txt
@@ -10,3 +10,4 @@
 -r ./requirements-convert_hf_to_gguf_update.txt
 -r ./requirements-convert_legacy_llama.txt
 -r ./requirements-convert_llama_ggml_to_gguf.txt
+-r ./requirements-tool_bench.txt

package/src/llama.cpp/requirements/requirements-tool_bench.txt
@@ -0,0 +1,12 @@
+aiohttp~=3.9.3
+pytest~=8.3.3
+huggingface_hub~=0.23.2
+matplotlib~=3.10.0
+numpy~=1.26.4
+openai~=1.55.3
+pandas~=2.2.3
+prometheus-client~=0.20.0
+requests~=2.32.3
+wget~=3.2
+typer~=0.15.1
+seaborn~=0.13.2

package/src/llama.cpp/requirements.txt
@@ -10,3 +10,4 @@
 -r ./requirements/requirements-convert_hf_to_gguf_update.txt
 -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt
 -r ./requirements/requirements-convert_lora_to_gguf.txt
+-r ./requirements/requirements-tool_bench.txt

package/src/llama.cpp/src/llama-arch.cpp
@@ -36,6 +36,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MINICPM3, "minicpm3" },
     { LLM_ARCH_GEMMA, "gemma" },
     { LLM_ARCH_GEMMA2, "gemma2" },
+    { LLM_ARCH_GEMMA3, "gemma3" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_XVERSE, "xverse" },
@@ -766,6 +767,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GEMMA3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {

package/src/llama.cpp/src/llama-arch.h
@@ -40,6 +40,7 @@ enum llm_arch {
    LLM_ARCH_MINICPM3,
    LLM_ARCH_GEMMA,
    LLM_ARCH_GEMMA2,
+   LLM_ARCH_GEMMA3,
    LLM_ARCH_STARCODER2,
    LLM_ARCH_MAMBA,
    LLM_ARCH_XVERSE,

package/src/llama.cpp/src/llama-chat.cpp
@@ -4,6 +4,7 @@
 
 #include <map>
 #include <sstream>
+#include <algorithm>
 
 #if __cplusplus >= 202000L
 #define LU8(x) (const char*)(u8##x)