llama_cpp 0.16.0 → 0.16.1

Files changed (134)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/ext/llama_cpp/extconf.rb +2 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +2 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +2 -0
  7. data/vendor/tmp/llama.cpp/Makefile +110 -53
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  126. data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
  127. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
  128. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
  129. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
  130. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
  131. data/vendor/tmp/llama.cpp/ggml.c +102 -275
  132. data/vendor/tmp/llama.cpp/llama.cpp +103 -47
  133. data/vendor/tmp/llama.cpp/llama.h +4 -0
  134. metadata +15 -3
data/vendor/tmp/llama.cpp/llama.cpp CHANGED
@@ -21,6 +21,10 @@
  # include "ggml-kompute.h"
  #endif

+ #ifdef GGML_USE_BLAS
+ # include "ggml-blas.h"
+ #endif
+
  #ifdef GGML_USE_METAL
  # include "ggml-metal.h"
  #endif
@@ -704,6 +708,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
  { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
  { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -2298,9 +2303,13 @@ struct llama_context {
  std::vector<ggml_backend_t> backends;
  #ifdef GGML_USE_METAL
  ggml_backend_t backend_metal = nullptr;
+ #endif
+ #ifdef GGML_USE_BLAS
+ ggml_backend_t backend_blas = nullptr;
  #endif
  ggml_backend_t backend_cpu = nullptr;

+
  const llama_model & model;

  // key + value cache for the self attention
@@ -4552,35 +4561,6 @@ static void llm_load_vocab(
  vocab.special_cls_id = -1;
  vocab.special_mask_id = -1;

- // For Fill-In-the-Middle (FIM)/infill models which where converted
- // prior to support of FIM special tokens in GGUF, the following
- // will allow those models to continue to work. The general names
- // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
- // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
- // new versions of these models have been published.
- std::string gen_name;
- ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
- std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
- [](unsigned char c){ return std::tolower(c); });
-
- if (gen_name.find("code") != std::string::npos) {
- if (model.arch == LLM_ARCH_LLAMA) {
- vocab.special_prefix_id = 32007;
- vocab.special_suffix_id = 32008;
- vocab.special_middle_id = 32009;
- vocab.special_eot_id = 32010;
- } else if (model.arch == LLM_ARCH_GEMMA) {
- vocab.special_prefix_id = 67;
- vocab.special_suffix_id = 69;
- vocab.special_middle_id = 68;
- // TODO: this is not EOT, it is "file separator" token, needs fix
- // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
- //vocab.special_eot_id = 70;
- vocab.special_eot_id = 107;
- }
- }
-
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
  if (add_space_prefix_keyidx != -1) {
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4653,8 +4633,7 @@ static void llm_load_vocab(
  LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
  LLAMA_LOG_WARN("%s: \n", __func__);
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (
- tokenizer_pre == "default") {
+ } else if (tokenizer_pre == "default") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  } else if (
  tokenizer_pre == "llama3" ||
@@ -4681,7 +4660,8 @@ static void llm_load_vocab(
  tokenizer_pre == "jina-es" ||
  tokenizer_pre == "jina-de" ||
  tokenizer_pre == "jina-v2-es" ||
- tokenizer_pre == "jina-v2-de") {
+ tokenizer_pre == "jina-v2-de" ||
+ tokenizer_pre == "jina-v2-code") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
  } else if (
  tokenizer_pre == "refact") {
@@ -4704,6 +4684,9 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "smaug-bpe") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+ } else if (
+ tokenizer_pre == "poro-chat") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -4761,6 +4744,45 @@ static void llm_load_vocab(

  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+ // For Fill-In-the-Middle (FIM)/infill models which where converted
+ // prior to support of FIM special tokens in GGUF, the following
+ // will allow those models to continue to work. The general names
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+ // new versions of these models have been published.
+ std::string gen_name;
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+ [](unsigned char c){ return std::tolower(c); });
+
+ if (gen_name.find("code") != std::string::npos) {
+ if (model.arch == LLM_ARCH_LLAMA
+ && 32010 < vocab.id_to_token.size()
+ && vocab.id_to_token[32007].text == "<PRE>"
+ && vocab.id_to_token[32008].text == "<SUF>"
+ && vocab.id_to_token[32009].text == "<MID>"
+ && vocab.id_to_token[32010].text == "<EOT>") {
+ vocab.special_prefix_id = 32007;
+ vocab.special_suffix_id = 32008;
+ vocab.special_middle_id = 32009;
+ vocab.special_eot_id = 32010;
+ } else if (model.arch == LLM_ARCH_GEMMA
+ && 107 < vocab.id_to_token.size()
+ && vocab.id_to_token[67].text == "<|fim_prefix|>"
+ && vocab.id_to_token[69].text == "<|fim_suffix|>"
+ && vocab.id_to_token[68].text == "<|fim_middle|>"
+ && vocab.id_to_token[107].text == "<end_of_turn>") {
+ vocab.special_prefix_id = 67;
+ vocab.special_suffix_id = 69;
+ vocab.special_middle_id = 68;
+ // TODO: this is not EOT, it is "file separator" token, needs fix
+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+ //vocab.special_eot_id = 70;
+ vocab.special_eot_id = 107;
+ }
+ }
+
  try {
  vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
  } catch (const std::exception & e) {
@@ -5515,7 +5537,7 @@ static bool llm_load_tensors(

  layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
  } else {
- layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  }

  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5556,6 +5578,9 @@ static bool llm_load_tensors(
  layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
  layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});

+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});

@@ -8519,6 +8544,11 @@ struct llm_build_context {
  // attention layer norm
  cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);

+ if (model.layers[il].attn_norm_2 != nullptr) {
+ cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+ }
+
  struct ggml_tensor * ffn_inp = cur;
  cb(ffn_inp, "ffn_inp", il);

@@ -11520,7 +11550,8 @@ static struct ggml_cgraph * llama_build_graph(
  if (batch.n_tokens < 32 || full_offload) {
  if (il != -1 && strcmp(name, "norm") == 0) {
  for (auto * backend : lctx.backends) {
- if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+ if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+ (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
  ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
  break;
  }
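
Note on the hunk above: the scheduler hint no longer calls ggml_backend_buft_supports_backend(); it now asks the backend itself whether it supports the layer's buffer type and whether it can run or offload the op before pinning a "norm" tensor to it. Below is a minimal sketch of the combined predicate, assuming the vendored ggml-backend.h is on the include path; the helper name is illustrative, not part of the patch.

    // Sketch only: the eligibility test llama_build_graph() now applies when pinning
    // "norm" tensors to a backend. backend_can_take_tensor() is a hypothetical helper;
    // the three ggml_backend_* calls are the ones used in the hunk above.
    #include "ggml-backend.h"

    static bool backend_can_take_tensor(ggml_backend_t backend,
                                        ggml_backend_buffer_type_t buft,
                                        struct ggml_tensor * op) {
        return ggml_backend_supports_buft(backend, buft) &&
               (ggml_backend_supports_op(backend, op) || ggml_backend_offload_op(backend, op));
    }
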
@@ -12017,6 +12048,11 @@ static void llama_graph_compute(
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
  ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
  }
+ #ifdef GGML_USE_BLAS
+ if (lctx.backend_blas != nullptr) {
+ ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+ }
+ #endif

  ggml_backend_sched_graph_compute_async(lctx.sched, gf);

@@ -12239,17 +12275,6 @@ static int llama_decode_internal(
  }
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

- // for big prompts, if BLAS is enabled, it is better to use only one thread
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
- // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
- // with the BLAS calls. need a better solution
- // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
- // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
- if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
- n_threads = std::min(4, n_threads);
- }
-
  ggml_backend_sched_alloc_graph(lctx.sched, gf);

  llama_set_inputs(lctx, u_batch);
@@ -13016,6 +13041,11 @@ struct llm_tokenizer_bpe {
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
  });
  break;
+ case LLAMA_VOCAB_PRE_TYPE_PORO:
+ word_collection = unicode_regex_split(text, {
+ " ?[^(\\s|.,!?…。,、।۔،)]+",
+ });
+ break;
  default:
  // default regex for BPE tokenization pre-processing
  word_collection = unicode_regex_split(text, {
@@ -13631,7 +13661,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
  const uint32_t chr) {

  bool found = false;
- bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;

  GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT

@@ -13640,6 +13670,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
  // inclusive range, e.g. [a-z]
  found = found || (pos->value <= chr && chr <= pos[1].value);
  pos += 2;
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+ // Any character matches "."
+ found = true;
+ pos += 1;
  } else {
  // exact char match, e.g. [a] or "a"
  found = found || pos->value == chr;
@@ -13657,7 +13691,7 @@ static bool llama_grammar_match_partial_char(
  const llama_grammar_element * pos,
  const llama_partial_utf8 partial_utf8) {

- bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
  GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);

  uint32_t partial_value = partial_utf8.value;
@@ -13687,6 +13721,9 @@ static bool llama_grammar_match_partial_char(
  return is_positive_char;
  }
  pos += 2;
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+ // Any character matches "."
+ return true;
  } else {
  // exact char match, e.g. [a] or "a"
  if (low <= pos->value && pos->value <= high) {
@@ -13747,6 +13784,7 @@ static void llama_grammar_advance_stack(
  }
  case LLAMA_GRETYPE_CHAR:
  case LLAMA_GRETYPE_CHAR_NOT:
+ case LLAMA_GRETYPE_CHAR_ANY:
  if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
  // only add the stack if it's not a duplicate of one we already have
  new_stacks.emplace_back(stack);
@@ -15220,6 +15258,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (imatrix_data) {
  LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
  qs.has_imatrix = true;
+ // check imatrix for nans or infs
+ for (const auto & kv : *imatrix_data) {
+ for (float f : kv.second) {
+ if (!std::isfinite(f)) {
+ throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+ }
+ }
+ }
  }
  }

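
For reference, the new validation above rejects any importance-matrix entry for which std::isfinite() is false, i.e. NaN as well as positive or negative infinity. A tiny standalone illustration of that predicate (not part of the patch):

    // Standalone illustration of the std::isfinite() predicate used by the new
    // imatrix check; ordinary values pass, NaN and infinities are rejected.
    #include <cassert>
    #include <cmath>
    #include <limits>

    int main() {
        assert( std::isfinite(1.0f));
        assert(!std::isfinite(std::numeric_limits<float>::quiet_NaN()));
        assert(!std::isfinite(std::numeric_limits<float>::infinity()));
        return 0;
    }
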
@@ -16226,6 +16272,16 @@ struct llama_context * llama_new_context_with_model(
  ctx->backends.push_back(backend);
  }
  #endif
+
+ #ifdef GGML_USE_BLAS
+ ctx->backend_blas = ggml_backend_blas_init();
+ if (ctx->backend_blas == nullptr) {
+ LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+ } else {
+ ctx->backends.push_back(ctx->backend_blas);
+ }
+ #endif
+
  #if defined(GGML_USE_RPC)
  if (model->n_gpu_layers > 0) {
  for (const auto & endpoint : model->rpc_servers) {
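
The hunk above registers the new ggml BLAS backend with the context, and the llama_graph_compute() hunk earlier keeps its thread count in sync via ggml_backend_blas_set_n_threads(). A minimal usage sketch follows, assuming a build with GGML_USE_BLAS defined and the vendored ggml-blas.h available; the wrapper function is illustrative only.

    // Sketch: create the BLAS backend the same way the patched
    // llama_new_context_with_model() does. ggml_backend_blas_init() may return
    // nullptr (the patch logs a warning in that case), so check before use.
    // try_init_blas_backend() is a hypothetical helper, not part of the API.
    #include <cstdio>
    #include "ggml-blas.h"

    static ggml_backend_t try_init_blas_backend(int n_threads) {
        ggml_backend_t backend = ggml_backend_blas_init();
        if (backend == nullptr) {
            fprintf(stderr, "failed to initialize BLAS backend\n");
            return nullptr;
        }
        // mirrors the call added to llama_graph_compute()
        ggml_backend_blas_set_n_threads(backend, n_threads);
        return backend;
    }
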
data/vendor/tmp/llama.cpp/llama.h CHANGED
@@ -86,6 +86,7 @@ extern "C" {
  LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
  LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
  LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+ LLAMA_VOCAB_PRE_TYPE_PORO = 15,
  };

  // note: these values should be synchronized with ggml_rope
@@ -365,6 +366,9 @@ extern "C" {
  // modifies a preceding LLAMA_GRETYPE_CHAR or
  // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
  LLAMA_GRETYPE_CHAR_ALT = 6,
+
+ // any character (.)
+ LLAMA_GRETYPE_CHAR_ANY = 7,
  };

  typedef struct llama_grammar_element {
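
The new LLAMA_GRETYPE_CHAR_ANY element backs the "any character" construct handled by the llama_grammar_match_char()/llama_grammar_match_partial_char() changes above, which treat it as a positive match for every code point. A hypothetical illustration of how a rule matching any single character followed by a literal 'x' could be laid out as llama_grammar_element values (the array and its name are examples, not taken from the diff; the type/value field order is assumed from the struct declared in llama.h):

    // Hypothetical layout: one grammar alternate equivalent to "any single
    // character, then the literal 'x'".
    #include "llama.h"

    static const llama_grammar_element rule_any_then_x[] = {
        { LLAMA_GRETYPE_CHAR_ANY, 0   }, // "." matches any code point
        { LLAMA_GRETYPE_CHAR,     'x' }, // exact character match
        { LLAMA_GRETYPE_END,      0   }, // end of this alternate/rule
    };
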
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.16.0
+ version: 0.16.1
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2024-06-08 00:00:00.000000000 Z
+ date: 2024-06-15 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -42,6 +42,8 @@ files:
  - vendor/tmp/llama.cpp/ggml-backend-impl.h
  - vendor/tmp/llama.cpp/ggml-backend.c
  - vendor/tmp/llama.cpp/ggml-backend.h
+ - vendor/tmp/llama.cpp/ggml-blas.cpp
+ - vendor/tmp/llama.cpp/ggml-blas.h
  - vendor/tmp/llama.cpp/ggml-common.h
  - vendor/tmp/llama.cpp/ggml-cuda.cu
  - vendor/tmp/llama.cpp/ggml-cuda.h
@@ -161,6 +163,16 @@ files:
  - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
  - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
  - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
+ - vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu
+ - vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu
+ - vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu
+ - vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu
+ - vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu
+ - vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu
+ - vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu
+ - vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu
+ - vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu
+ - vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu
  - vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu
  - vendor/tmp/llama.cpp/ggml-cuda/unary.cu
  - vendor/tmp/llama.cpp/ggml-cuda/upscale.cu
@@ -214,7 +226,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.5.10
+ rubygems_version: 3.5.9
  signing_key:
  specification_version: 4
  summary: Ruby bindings for the llama.cpp.