llama_cpp 0.16.0 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/ext/llama_cpp/extconf.rb +2 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +2 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +2 -0
  7. data/vendor/tmp/llama.cpp/Makefile +110 -53
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  126. data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
  127. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
  128. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
  129. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
  130. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
  131. data/vendor/tmp/llama.cpp/ggml.c +102 -275
  132. data/vendor/tmp/llama.cpp/llama.cpp +103 -47
  133. data/vendor/tmp/llama.cpp/llama.h +4 -0
  134. metadata +15 -3
data/vendor/tmp/llama.cpp/llama.cpp CHANGED
@@ -21,6 +21,10 @@
 # include "ggml-kompute.h"
 #endif
 
+#ifdef GGML_USE_BLAS
+# include "ggml-blas.h"
+#endif
+
 #ifdef GGML_USE_METAL
 # include "ggml-metal.h"
 #endif
@@ -704,6 +708,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
     { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
     { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+    { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
     { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
     { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
     { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -2298,9 +2303,13 @@ struct llama_context {
     std::vector<ggml_backend_t> backends;
 #ifdef GGML_USE_METAL
     ggml_backend_t backend_metal = nullptr;
+#endif
+#ifdef GGML_USE_BLAS
+    ggml_backend_t backend_blas = nullptr;
 #endif
     ggml_backend_t backend_cpu = nullptr;
 
+
     const llama_model & model;
 
     // key + value cache for the self attention
@@ -4552,35 +4561,6 @@ static void llm_load_vocab(
     vocab.special_cls_id = -1;
     vocab.special_mask_id = -1;
 
-    // For Fill-In-the-Middle (FIM)/infill models which where converted
-    // prior to support of FIM special tokens in GGUF, the following
-    // will allow those models to continue to work. The general names
-    // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
-    // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
-    // new versions of these models have been published.
-    std::string gen_name;
-    ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
-    std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
-        [](unsigned char c){ return std::tolower(c); });
-
-    if (gen_name.find("code") != std::string::npos) {
-        if (model.arch == LLM_ARCH_LLAMA) {
-            vocab.special_prefix_id = 32007;
-            vocab.special_suffix_id = 32008;
-            vocab.special_middle_id = 32009;
-            vocab.special_eot_id = 32010;
-        } else if (model.arch == LLM_ARCH_GEMMA) {
-            vocab.special_prefix_id = 67;
-            vocab.special_suffix_id = 69;
-            vocab.special_middle_id = 68;
-            // TODO: this is not EOT, it is "file separator" token, needs fix
-            //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-            //vocab.special_eot_id = 70;
-            vocab.special_eot_id = 107;
-        }
-    }
-
     const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
     if (add_space_prefix_keyidx != -1) {
         vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4653,8 +4633,7 @@ static void llm_load_vocab(
         LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
         LLAMA_LOG_WARN("%s: \n", __func__);
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-    } else if (
-        tokenizer_pre == "default") {
+    } else if (tokenizer_pre == "default") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
     } else if (
         tokenizer_pre == "llama3" ||
@@ -4681,7 +4660,8 @@ static void llm_load_vocab(
         tokenizer_pre == "jina-es" ||
         tokenizer_pre == "jina-de" ||
         tokenizer_pre == "jina-v2-es" ||
-        tokenizer_pre == "jina-v2-de") {
+        tokenizer_pre == "jina-v2-de" ||
+        tokenizer_pre == "jina-v2-code") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
     } else if (
         tokenizer_pre == "refact") {
@@ -4704,6 +4684,9 @@ static void llm_load_vocab(
     } else if (
         tokenizer_pre == "smaug-bpe") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+    } else if (
+        tokenizer_pre == "poro-chat") {
+        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
     } else {
         throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
     }
@@ -4761,6 +4744,45 @@ static void llm_load_vocab(
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        // For Fill-In-the-Middle (FIM)/infill models which where converted
+        // prior to support of FIM special tokens in GGUF, the following
+        // will allow those models to continue to work. The general names
+        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+        // new versions of these models have been published.
+        std::string gen_name;
+        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+            [](unsigned char c){ return std::tolower(c); });
+
+        if (gen_name.find("code") != std::string::npos) {
+            if (model.arch == LLM_ARCH_LLAMA
+                && 32010 < vocab.id_to_token.size()
+                && vocab.id_to_token[32007].text == "<PRE>"
+                && vocab.id_to_token[32008].text == "<SUF>"
+                && vocab.id_to_token[32009].text == "<MID>"
+                && vocab.id_to_token[32010].text == "<EOT>") {
+                vocab.special_prefix_id = 32007;
+                vocab.special_suffix_id = 32008;
+                vocab.special_middle_id = 32009;
+                vocab.special_eot_id = 32010;
+            } else if (model.arch == LLM_ARCH_GEMMA
+                && 107 < vocab.id_to_token.size()
+                && vocab.id_to_token[67].text == "<|fim_prefix|>"
+                && vocab.id_to_token[69].text == "<|fim_suffix|>"
+                && vocab.id_to_token[68].text == "<|fim_middle|>"
+                && vocab.id_to_token[107].text == "<end_of_turn>") {
+                vocab.special_prefix_id = 67;
+                vocab.special_suffix_id = 69;
+                vocab.special_middle_id = 68;
+                // TODO: this is not EOT, it is "file separator" token, needs fix
+                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+                //vocab.special_eot_id = 70;
+                vocab.special_eot_id = 107;
+            }
+        }
+
         try {
             vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
         } catch (const std::exception & e) {
@@ -5515,7 +5537,7 @@ static bool llm_load_tensors(
 
             layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
         } else {
-            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
         }
 
         layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5556,6 +5578,9 @@ static bool llm_load_tensors(
         layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
         layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
 
+        layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 
@@ -8519,6 +8544,11 @@ struct llm_build_context {
                 // attention layer norm
                 cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
 
+                if (model.layers[il].attn_norm_2 != nullptr) {
+                    cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+                    cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+                }
+
                 struct ggml_tensor * ffn_inp = cur;
                 cb(ffn_inp, "ffn_inp", il);
 
@@ -11520,7 +11550,8 @@ static struct ggml_cgraph * llama_build_graph(
         if (batch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 for (auto * backend : lctx.backends) {
-                    if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
                         ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
                         break;
                     }
@@ -12017,6 +12048,11 @@ static void llama_graph_compute(
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
+#ifdef GGML_USE_BLAS
+    if (lctx.backend_blas != nullptr) {
+        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+    }
+#endif
 
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
@@ -12239,17 +12275,6 @@ static int llama_decode_internal(
     }
     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-    //       with the BLAS calls. need a better solution
-    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-    //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        n_threads = std::min(4, n_threads);
-    }
-
     ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
     llama_set_inputs(lctx, u_batch);
@@ -13016,6 +13041,11 @@ struct llm_tokenizer_bpe {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_PORO:
+                word_collection = unicode_regex_split(text, {
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                });
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
@@ -13631,7 +13661,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
         const uint32_t chr) {
 
     bool found = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
 
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
@@ -13640,6 +13670,10 @@
             // inclusive range, e.g. [a-z]
             found = found || (pos->value <= chr && chr <= pos[1].value);
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
         } else {
             // exact char match, e.g. [a] or "a"
             found = found || pos->value == chr;
@@ -13657,7 +13691,7 @@ static bool llama_grammar_match_partial_char(
         const llama_grammar_element * pos,
         const llama_partial_utf8 partial_utf8) {
 
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
 
     uint32_t partial_value = partial_utf8.value;
@@ -13687,6 +13721,9 @@
             return is_positive_char;
         }
         pos += 2;
+    } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+        // Any character matches "."
+        return true;
     } else {
         // exact char match, e.g. [a] or "a"
         if (low <= pos->value && pos->value <= high) {
@@ -13747,6 +13784,7 @@ static void llama_grammar_advance_stack(
             }
             case LLAMA_GRETYPE_CHAR:
             case LLAMA_GRETYPE_CHAR_NOT:
+            case LLAMA_GRETYPE_CHAR_ANY:
                 if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                     // only add the stack if it's not a duplicate of one we already have
                     new_stacks.emplace_back(stack);
@@ -15220,6 +15258,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
             qs.has_imatrix = true;
+            // check imatrix for nans or infs
+            for (const auto & kv : *imatrix_data) {
+                for (float f : kv.second) {
+                    if (!std::isfinite(f)) {
+                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                    }
+                }
+            }
         }
     }
 
@@ -16226,6 +16272,16 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #endif
+
+#ifdef GGML_USE_BLAS
+        ctx->backend_blas = ggml_backend_blas_init();
+        if (ctx->backend_blas == nullptr) {
+            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+        } else {
+            ctx->backends.push_back(ctx->backend_blas);
+        }
+#endif
+
 #if defined(GGML_USE_RPC)
     if (model->n_gpu_layers > 0) {
         for (const auto & endpoint : model->rpc_servers) {
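As an aside on the BLAS-related hunks above: the new ggml-blas backend can also be driven directly through the ggml-backend API. Below is a minimal, illustrative sketch (not part of this diff) using only the functions that appear in these hunks plus ggml_backend_free; the thread count of 4 is an arbitrary example value, and the program assumes the vendored llama.cpp was built with GGML_USE_BLAS.

    // Hedged sketch: create the BLAS backend, configure it, and release it.
    #include "ggml-backend.h"
    #include "ggml-blas.h"

    int main() {
        ggml_backend_t blas = ggml_backend_blas_init();   // returns nullptr if BLAS is unavailable
        if (blas != nullptr) {
            ggml_backend_blas_set_n_threads(blas, 4);     // same call llama_graph_compute now makes
            // ... schedule mat-mul heavy graphs on this backend, as llama_new_context_with_model does ...
            ggml_backend_free(blas);                      // release the backend
        }
        return 0;
    }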
data/vendor/tmp/llama.cpp/llama.h CHANGED
@@ -86,6 +86,7 @@ extern "C" {
     LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
     LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
     LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+    LLAMA_VOCAB_PRE_TYPE_PORO = 15,
 };
 
 // note: these values should be synchronized with ggml_rope
@@ -365,6 +366,9 @@ extern "C" {
     // modifies a preceding LLAMA_GRETYPE_CHAR or
     // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
     LLAMA_GRETYPE_CHAR_ALT = 6,
+
+    // any character (.)
+    LLAMA_GRETYPE_CHAR_ANY = 7,
 };
 
 typedef struct llama_grammar_element {
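For illustration only (not code from this release): the comment added above describes LLAMA_GRETYPE_CHAR_ANY as "any character (.)". Using the llama_grammar_element struct shown at the end of the hunk above, one alternate of a rule matching a literal 'a', then any single character, then 'b' could be written as the following sketch; the array name is hypothetical.

    // Hedged sketch: grammar elements for the pattern 'a', any character, 'b'.
    #include "llama.h"

    static const llama_grammar_element rule_alt[] = {
        { LLAMA_GRETYPE_CHAR,     'a' }, // exact character 'a'
        { LLAMA_GRETYPE_CHAR_ANY,  0  }, // any character, i.e. "." (new in this release)
        { LLAMA_GRETYPE_CHAR,     'b' }, // exact character 'b'
        { LLAMA_GRETYPE_END,       0  }, // end of this alternate
    };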
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.16.0
+  version: 0.16.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-06-08 00:00:00.000000000 Z
+date: 2024-06-15 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -42,6 +42,8 @@ files:
 - vendor/tmp/llama.cpp/ggml-backend-impl.h
 - vendor/tmp/llama.cpp/ggml-backend.c
 - vendor/tmp/llama.cpp/ggml-backend.h
+- vendor/tmp/llama.cpp/ggml-blas.cpp
+- vendor/tmp/llama.cpp/ggml-blas.h
 - vendor/tmp/llama.cpp/ggml-common.h
 - vendor/tmp/llama.cpp/ggml-cuda.cu
 - vendor/tmp/llama.cpp/ggml-cuda.h
@@ -161,6 +163,16 @@ files:
 - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
 - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
 - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu
 - vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu
 - vendor/tmp/llama.cpp/ggml-cuda/unary.cu
 - vendor/tmp/llama.cpp/ggml-cuda/upscale.cu
@@ -214,7 +226,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.5.10
+rubygems_version: 3.5.9
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.