llama_cpp 0.15.4 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +3 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +17 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +166 -82
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  141. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  142. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
  143. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  144. data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
  145. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
  146. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  147. data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
  148. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  149. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
  150. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
  151. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
  152. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
  153. data/vendor/tmp/llama.cpp/ggml.c +278 -603
  154. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  155. data/vendor/tmp/llama.cpp/llama.cpp +345 -473
  156. data/vendor/tmp/llama.cpp/llama.h +21 -43
  157. metadata +134 -7
  158. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  159. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  160. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  161. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
@@ -13,8 +13,6 @@

  #ifdef GGML_USE_CUDA
  # include "ggml-cuda.h"
- #elif defined(GGML_USE_CLBLAST)
- # include "ggml-opencl.h"
  #elif defined(GGML_USE_VULKAN)
  # include "ggml-vulkan.h"
  #elif defined(GGML_USE_SYCL)
@@ -23,6 +21,10 @@
  # include "ggml-kompute.h"
  #endif

+ #ifdef GGML_USE_BLAS
+ # include "ggml-blas.h"
+ #endif
+
  #ifdef GGML_USE_METAL
  # include "ggml-metal.h"
  #endif
@@ -110,7 +112,7 @@
  //

  LLAMA_ATTRIBUTE_FORMAT(2, 3)
- static void llama_log_internal (ggml_log_level level, const char* format, ...);
+ static void llama_log_internal (ggml_log_level level, const char * format, ...);
  static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);

  #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
@@ -706,6 +708,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
  { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
  { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -1850,7 +1853,7 @@ struct llama_hparams {
  float rope_attn_factor = 1.0f;
  float rope_freq_base_train;
  float rope_freq_scale_train;
- uint32_t n_yarn_orig_ctx;
+ uint32_t n_ctx_orig_yarn;
  float rope_yarn_log_mul;

  // for State Space Models
@@ -1892,7 +1895,7 @@ struct llama_hparams {
  if (this->n_expert_shared != other.n_expert_shared) return true;

  if (this->rope_finetuned != other.rope_finetuned) return true;
- if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
+ if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;

  if (this->ssm_d_conv != other.ssm_d_conv) return true;
  if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -1951,7 +1954,7 @@ struct llama_cparams {
  float rope_freq_base;
  float rope_freq_scale;

- uint32_t n_yarn_orig_ctx;
+ uint32_t n_ctx_orig_yarn;
  // These hyperparameters are not exposed in GGUF, because all
  // existing YaRN models use the same values for them.
  float yarn_ext_factor;
@@ -2149,12 +2152,12 @@ struct llama_control_vector {
  struct llama_vocab {
  using id = int32_t;
  using token = std::string;
- using ttype = llama_token_type;
+ using tattr = llama_token_attr;

  struct token_data {
  token text;
  float score;
- ttype type;
+ tattr attr;
  };

  enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -2164,8 +2167,7 @@ struct llama_vocab {
  std::vector<token_data> id_to_token;

  std::vector<id> cache_special_tokens;
- std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
- std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
+ std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);

  std::map<std::pair<std::string, std::string>, int> bpe_ranks;

@@ -2301,9 +2303,13 @@ struct llama_context {
  std::vector<ggml_backend_t> backends;
  #ifdef GGML_USE_METAL
  ggml_backend_t backend_metal = nullptr;
+ #endif
+ #ifdef GGML_USE_BLAS
+ ggml_backend_t backend_blas = nullptr;
  #endif
  ggml_backend_t backend_cpu = nullptr;

+
  const llama_model & model;

  // key + value cache for the self attention
@@ -2372,13 +2378,34 @@ struct llama_context {
  struct llama_control_vector cvec;
  };

+ static size_t llama_get_device_count(const llama_model & model) {
+ size_t count = 1;
+ #if defined(GGML_USE_CUDA)
+ count = ggml_backend_cuda_get_device_count();
+ #elif defined(GGML_USE_SYCL)
+ count = ggml_backend_sycl_get_device_count();
+ #elif defined(GGML_USE_VULKAN)
+ count = ggml_backend_vk_get_device_count();
+ #endif
+ #if defined(GGML_USE_RPC)
+ count += model.rpc_servers.size();
+ #endif
+ return count;
+ GGML_UNUSED(model);
+ }
+
  static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
  ggml_backend_buffer_type_t buft = nullptr;

- #ifdef GGML_USE_RPC
- std::string endpoint = model.rpc_servers[gpu];
- buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
- #elif defined(GGML_USE_METAL)
+ #if defined(GGML_USE_RPC)
+ int dev_count = (int)llama_get_device_count(model);
+ int rpc_count = (int)model.rpc_servers.size();
+ if (gpu >= dev_count - rpc_count) {
+ const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+ return ggml_backend_rpc_buffer_type(endpoint);
+ }
+ #endif
+ #if defined(GGML_USE_METAL)
  buft = ggml_backend_metal_buffer_type();
  #elif defined(GGML_USE_CUDA)
  buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2386,8 +2413,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
  buft = ggml_backend_vk_buffer_type(gpu);
  #elif defined(GGML_USE_SYCL)
  buft = ggml_backend_sycl_buffer_type(gpu);
- #elif defined(GGML_USE_CLBLAST)
- buft = ggml_backend_opencl_buffer_type();
  #elif defined(GGML_USE_KOMPUTE)
  buft = ggml_backend_kompute_buffer_type(gpu);
  if (buft == nullptr) {
@@ -2426,29 +2451,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
  GGML_UNUSED(tensor_split);
  }

- static size_t llama_get_device_count(const llama_model & model) {
- #if defined(GGML_USE_RPC)
- return model.rpc_servers.size();
- #elif defined(GGML_USE_CUDA)
- return ggml_backend_cuda_get_device_count();
- #elif defined(GGML_USE_SYCL)
- return ggml_backend_sycl_get_device_count();
- #elif defined(GGML_USE_VULKAN)
- return ggml_backend_vk_get_device_count();
- #else
- return 1;
- #endif
- GGML_UNUSED(model);
- }
-
  static size_t llama_get_device_memory(const llama_model & model, int device) {
  #if defined(GGML_USE_RPC)
- size_t total;
- size_t free;
- std::string endpoint = model.rpc_servers[device];
- ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
- return free;
- #elif defined(GGML_USE_CUDA)
+ int dev_count = (int)llama_get_device_count(model);
+ int rpc_count = (int)model.rpc_servers.size();
+ if (device >= dev_count - rpc_count) {
+ size_t total;
+ size_t free;
+ const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+ ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+ return free;
+ }
+ #endif
+ #if defined(GGML_USE_CUDA)
  size_t total;
  size_t free;
  ggml_backend_cuda_get_device_memory(device, &free, &total);
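Note on the hunks above: llama_get_device_count() now counts RPC servers in addition to the local CUDA/SYCL/Vulkan devices, instead of RPC replacing the local device list outright, and any device index at or past dev_count - rpc_count is routed to an RPC endpoint. A minimal standalone sketch of that index arithmetic, using illustrative names (device_map, local_gpu_count) rather than llama.cpp's own:

#include <cstdio>
#include <string>
#include <vector>

// Sketch of the new device layout: local GPUs first, RPC servers appended.
// Indices [0, local) are local GPUs; [local, local + rpc) map to rpc_servers.
struct device_map {
    int local_gpu_count;                   // e.g. ggml_backend_cuda_get_device_count()
    std::vector<std::string> rpc_servers;  // e.g. endpoints passed on the command line

    int total() const { return local_gpu_count + (int) rpc_servers.size(); }

    // Mirrors the check `gpu >= dev_count - rpc_count` in the diff.
    bool is_rpc(int device) const { return device >= total() - (int) rpc_servers.size(); }

    // Mirrors `rpc_servers[device - dev_count + rpc_count]`.
    const std::string & endpoint(int device) const {
        return rpc_servers[device - total() + (int) rpc_servers.size()];
    }
};

int main() {
    device_map map{2, {"192.168.0.10:50052", "192.168.0.11:50052"}};
    for (int dev = 0; dev < map.total(); ++dev) {
        if (map.is_rpc(dev)) {
            std::printf("device %d -> RPC endpoint %s\n", dev, map.endpoint(dev).c_str());
        } else {
            std::printf("device %d -> local GPU %d\n", dev, dev);
        }
    }
    return 0;
}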
@@ -2520,10 +2535,6 @@ static bool llama_kv_cache_init(
  }
  }

- #ifdef GGML_USE_CLBLAST
- offload = false;
- #endif
-
  // count used buffer types
  std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
  if (offload) {
@@ -4003,8 +4014,8 @@ static void llm_load_hparams(
  ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  hparams.rope_finetuned = rope_finetuned;

- hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
- ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
+ hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
+ ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

  // rope_freq_base (optional)
  hparams.rope_freq_base_train = 10000.0f;
@@ -4550,35 +4561,6 @@ static void llm_load_vocab(
  vocab.special_cls_id = -1;
  vocab.special_mask_id = -1;

- // For Fill-In-the-Middle (FIM)/infill models which where converted
- // prior to support of FIM special tokens in GGUF, the following
- // will allow those models to continue to work. The general names
- // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
- // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
- // new versions of these models have been published.
- std::string gen_name;
- ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
- std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
- [](unsigned char c){ return std::tolower(c); });
-
- if (gen_name.find("code") != std::string::npos) {
- if (model.arch == LLM_ARCH_LLAMA) {
- vocab.special_prefix_id = 32007;
- vocab.special_suffix_id = 32008;
- vocab.special_middle_id = 32009;
- vocab.special_eot_id = 32010;
- } else if (model.arch == LLM_ARCH_GEMMA) {
- vocab.special_prefix_id = 67;
- vocab.special_suffix_id = 69;
- vocab.special_middle_id = 68;
- // TODO: this is not EOT, it is "file separator" token, needs fix
- // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
- //vocab.special_eot_id = 70;
- vocab.special_eot_id = 107;
- }
- }
-
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
  if (add_space_prefix_keyidx != -1) {
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4651,8 +4633,7 @@ static void llm_load_vocab(
  LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
  LLAMA_LOG_WARN("%s: \n", __func__);
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (
- tokenizer_pre == "default") {
+ } else if (tokenizer_pre == "default") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  } else if (
  tokenizer_pre == "llama3" ||
@@ -4679,7 +4660,8 @@ static void llm_load_vocab(
  tokenizer_pre == "jina-es" ||
  tokenizer_pre == "jina-de" ||
  tokenizer_pre == "jina-v2-es" ||
- tokenizer_pre == "jina-v2-de") {
+ tokenizer_pre == "jina-v2-de" ||
+ tokenizer_pre == "jina-v2-code") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
  } else if (
  tokenizer_pre == "refact") {
@@ -4702,6 +4684,9 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "smaug-bpe") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+ } else if (
+ tokenizer_pre == "poro-chat") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -4740,12 +4725,64 @@ static void llm_load_vocab(
  auto & token_data = vocab.id_to_token[i];
  token_data.text = std::move(word);
  token_data.score = scores ? scores[i] : 0.0f;
- token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
+ token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
+
+ if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
+ switch(toktypes[i]) {
+ case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
+ case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
+ case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
+ case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
+ case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+ case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
+ case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+ default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+ }
+ }
  }
  GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());

  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+ // For Fill-In-the-Middle (FIM)/infill models which where converted
+ // prior to support of FIM special tokens in GGUF, the following
+ // will allow those models to continue to work. The general names
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+ // new versions of these models have been published.
+ std::string gen_name;
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+ [](unsigned char c){ return std::tolower(c); });
+
+ if (gen_name.find("code") != std::string::npos) {
+ if (model.arch == LLM_ARCH_LLAMA
+ && 32010 < vocab.id_to_token.size()
+ && vocab.id_to_token[32007].text == "<PRE>"
+ && vocab.id_to_token[32008].text == "<SUF>"
+ && vocab.id_to_token[32009].text == "<MID>"
+ && vocab.id_to_token[32010].text == "<EOT>") {
+ vocab.special_prefix_id = 32007;
+ vocab.special_suffix_id = 32008;
+ vocab.special_middle_id = 32009;
+ vocab.special_eot_id = 32010;
+ } else if (model.arch == LLM_ARCH_GEMMA
+ && 107 < vocab.id_to_token.size()
+ && vocab.id_to_token[67].text == "<|fim_prefix|>"
+ && vocab.id_to_token[69].text == "<|fim_suffix|>"
+ && vocab.id_to_token[68].text == "<|fim_middle|>"
+ && vocab.id_to_token[107].text == "<end_of_turn>") {
+ vocab.special_prefix_id = 67;
+ vocab.special_suffix_id = 69;
+ vocab.special_middle_id = 68;
+ // TODO: this is not EOT, it is "file separator" token, needs fix
+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+ //vocab.special_eot_id = 70;
+ vocab.special_eot_id = 107;
+ }
+ }
+
  try {
  vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
  } catch (const std::exception & e) {
@@ -4831,7 +4868,7 @@ static void llm_load_vocab(
  // build special tokens cache
  {
  for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
- if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
+ if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
  vocab.cache_special_tokens.push_back(id);
  }
  }
@@ -4845,26 +4882,75 @@ static void llm_load_vocab(
  LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
  }

- // build token to piece caches
+ // build token to piece cache
  {
  size_t size_cache = 0;

- std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
- std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
+ std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);

  for (uint32_t id = 0; id < n_vocab; ++id) {
- cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
- cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
+ cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);

  size_cache += cache_token_to_piece[id].size();
- size_cache += cache_token_to_piece_special[id].size();
  }

- std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
- std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+ std::swap(vocab.cache_token_to_piece, cache_token_to_piece);

  LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
  }
+
+ // Handle per token attributes
+ //NOTE: Each model customizes per token attributes.
+ //NOTE: Per token attributes are missing from the GGUF file.
+ //TODO: Extract attributes from GGUF file.
+ {
+ auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
+ for (auto substr : substrs) {
+ if (str.find(substr) < std::string::npos) {
+ return true;
+ }
+ }
+ return false;
+ };
+
+ auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
+ uint32_t current = vocab.id_to_token.at(id).attr;
+ current = value ? (current | attr) : (current & ~attr);
+ vocab.id_to_token[id].attr = (llama_token_attr) current;
+ };
+
+ auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+ _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
+ };
+
+ std::string model_name;
+ std::string tokenizer_pre;
+
+ ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+
+ // model name to lowercase
+ std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+ [] (const std::string::value_type x) {
+ return std::tolower(x);
+ }
+ );
+
+ // set attributes by model/tokenizer name
+ if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+ } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+ for (auto id : vocab.cache_special_tokens) {
+ _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
+ }
+ for (auto token : {"</s>"}) {
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
+ }
+ for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
+ }
+ }
+ }
  }

  static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
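The vocabulary hunks above replace the single llama_token_type value per token with llama_token_attr bit flags, so one token can carry several attributes at once (for example, Phi-3 special tokens gain RSTRIP without losing their other bits). A self-contained sketch of the same set/clear pattern used by _set_tokenid_attr; the flag values below are illustrative, only the bit manipulation mirrors the diff:

#include <cstdint>
#include <cstdio>

// Illustrative attribute flags; llama.h defines the real llama_token_attr values.
enum token_attr : uint32_t {
    ATTR_NORMAL  = 1u << 0,
    ATTR_CONTROL = 1u << 1,
    ATTR_RSTRIP  = 1u << 2,
    ATTR_LSTRIP  = 1u << 3,
};

// Same pattern as _set_tokenid_attr in the diff: OR the bit in, or AND it out.
static uint32_t set_attr(uint32_t current, token_attr attr, bool value) {
    return value ? (current | attr) : (current & ~attr);
}

int main() {
    uint32_t attr = ATTR_CONTROL;               // e.g. "</s>" starts as a control token
    attr = set_attr(attr, ATTR_RSTRIP, true);   // Phi-3: also strip whitespace to the right
    attr = set_attr(attr, ATTR_RSTRIP, false);  // ...and how clearing a bit works
    std::printf("control=%d rstrip=%d\n",
                (attr & ATTR_CONTROL) != 0, (attr & ATTR_RSTRIP) != 0);
    return 0;
}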
@@ -4904,7 +4990,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
- LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
+ LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
@@ -5129,12 +5215,10 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- if (model.arch != LLM_ARCH_MINICPM){
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -5453,7 +5537,7 @@ static bool llm_load_tensors(

  layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
  } else {
- layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  }

  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5494,6 +5578,9 @@ static bool llm_load_tensors(
  layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
  layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});

+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});

@@ -7072,7 +7159,7 @@ struct llm_build_context {
  const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
  const int32_t n_outputs;
  const int32_t kv_head; // index of where we store new KV data in the cache
- const int32_t n_orig_ctx;
+ const int32_t n_ctx_orig;

  const bool flash_attn;

@@ -7121,7 +7208,7 @@ struct llm_build_context {
  n_kv (worst_case ? kv_self.size : kv_self.n),
  n_outputs (worst_case ? n_tokens : lctx.n_outputs),
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
- n_orig_ctx (cparams.n_yarn_orig_ctx),
+ n_ctx_orig (cparams.n_ctx_orig_yarn),
  flash_attn (cparams.flash_attn),
  pooling_type (cparams.pooling_type),
  rope_type (hparams.rope_type),
@@ -7179,7 +7266,7 @@ struct llm_build_context {
  ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  0),
- lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);

  cb(tmp, "K_shifted", il);
@@ -7288,7 +7375,7 @@ struct llm_build_context {
  // choose long/short freq factors based on the context size
  const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;

- if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
+ if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
  return model.layers[il].rope_long;
  }

@@ -7404,14 +7491,14 @@ struct llm_build_context {

  Qcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

  Kcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);
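From here on, the llama.cpp hunks make one mechanical edit over and over: every ggml_rope_ext call drops the unused context-size argument (the literal 0 in the old calls) and n_orig_ctx is renamed n_ctx_orig, consistent with the slimmed-down signature implied by the ggml.h changes listed above. A stub sketch of the call-site migration; rope_ext_stub is a placeholder with the same parameter shape, not the real ggml API:

#include <cstdio>

// Placeholder with the same *shape* as the new ggml_rope_ext tail parameters:
// the unused n_ctx slot is gone and n_ctx_orig follows rope_type directly.
static void rope_ext_stub(int n_rot, int rope_type, int n_ctx_orig,
                          float freq_base, float freq_scale) {
    std::printf("rope: n_rot=%d type=%d n_ctx_orig=%d base=%.1f scale=%g\n",
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale);
}

int main() {
    // old call sites passed (n_rot, rope_type, 0 /*unused n_ctx*/, n_orig_ctx, ...);
    // after the change the 0 is simply dropped:
    rope_ext_stub(128, 0, 4096, 10000.0f, 1.0f);
    return 0;
}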
@@ -7535,12 +7622,12 @@ struct llm_build_context {
  case MODEL_7B:
  Qcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  Kcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  break;
@@ -7647,14 +7734,14 @@ struct llm_build_context {

  Qcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

  Kcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);
@@ -7767,13 +7854,13 @@ struct llm_build_context {

  // using mode = 2 for neox mode
  Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

  Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);
@@ -7891,14 +7978,14 @@ struct llm_build_context {

  Qcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

  Kcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);
@@ -8044,14 +8131,14 @@ struct llm_build_context {

  Qcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

  Kcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);
@@ -8398,14 +8485,14 @@ struct llm_build_context {

  Qcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

  Kcur = ggml_rope_ext(
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);
@@ -8457,6 +8544,11 @@ struct llm_build_context {
  // attention layer norm
  cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);

+ if (model.layers[il].attn_norm_2 != nullptr) {
+ cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+ }
+
  struct ggml_tensor * ffn_inp = cur;
  cb(ffn_inp, "ffn_inp", il);

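The hunk above pairs with the optional attn_norm_2 tensors created in llm_load_tensors (loaded with TENSOR_NOT_REQUIRED) and arrives alongside the new jina-v2-code tokenizer: when the tensor exists in the GGUF file, the layer input is re-added and a second attention norm is applied. A toy sketch of this optional-tensor control flow, with stand-in types instead of ggml_tensor and llm_build_norm:

#include <cstdio>

struct tensor { float scale; };  // stand-in for ggml_tensor

// Stand-in for the real graph op; only the control flow mirrors the diff.
static float build_norm(float x, const tensor & w) { return x * w.scale; }

static float attn_block(float cur, float inp, const tensor * attn_norm_2) {
    if (attn_norm_2 != nullptr) {  // tensor was optional, may be absent
        cur = cur + inp;           // re-add the layer input, as in the diff
        cur = build_norm(cur, *attn_norm_2);
    }
    return cur;
}

int main() {
    tensor norm2{0.5f};
    std::printf("with attn_norm_2:    %.2f\n", attn_block(1.0f, 3.0f, &norm2));
    std::printf("without attn_norm_2: %.2f\n", attn_block(1.0f, 3.0f, nullptr));
    return 0;
}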
@@ -8838,14 +8930,14 @@ struct llm_build_context {
8838
8930
 
8839
8931
  Qcur = ggml_rope_ext(
8840
8932
  ctx0, Qcur, inp_pos, nullptr,
8841
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8933
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8842
8934
  ext_factor, attn_factor, beta_fast, beta_slow
8843
8935
  );
8844
8936
  cb(Qcur, "Qcur", il);
8845
8937
 
8846
8938
  Kcur = ggml_rope_ext(
8847
8939
  ctx0, Kcur, inp_pos, nullptr,
8848
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8940
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8849
8941
  ext_factor, attn_factor, beta_fast, beta_slow
8850
8942
  );
8851
8943
  cb(Kcur, "Kcur", il);
@@ -8957,13 +9049,13 @@ struct llm_build_context {
8957
9049
 
8958
9050
  // using mode = 2 for neox mode
8959
9051
  Qcur = ggml_rope_ext(
8960
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9052
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
8961
9053
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8962
9054
  );
8963
9055
  cb(Qcur, "Qcur", il);
8964
9056
 
8965
9057
  Kcur = ggml_rope_ext(
8966
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9058
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
8967
9059
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8968
9060
  );
8969
9061
  cb(Kcur, "Kcur", il);
@@ -9069,14 +9161,14 @@ struct llm_build_context {
9069
9161
 
9070
9162
  Qcur = ggml_rope_ext(
9071
9163
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9072
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9164
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9073
9165
  ext_factor, attn_factor, beta_fast, beta_slow
9074
9166
  );
9075
9167
  cb(Qcur, "Qcur", il);
9076
9168
 
9077
9169
  Kcur = ggml_rope_ext(
9078
9170
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9079
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9171
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9080
9172
  ext_factor, attn_factor, beta_fast, beta_slow
9081
9173
  );
9082
9174
  cb(Kcur, "Kcur", il);
@@ -9183,14 +9275,14 @@ struct llm_build_context {
9183
9275
 
9184
9276
  Qcur = ggml_rope_ext(
9185
9277
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9186
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9278
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9187
9279
  ext_factor, attn_factor, beta_fast, beta_slow
9188
9280
  );
9189
9281
  cb(Qcur, "Qcur", il);
9190
9282
 
9191
9283
  Kcur = ggml_rope_ext(
9192
9284
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9193
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9285
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9194
9286
  ext_factor, attn_factor, beta_fast, beta_slow
9195
9287
  );
9196
9288
  cb(Kcur, "Kcur", il);
@@ -9335,7 +9427,7 @@ struct llm_build_context {
9335
9427
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9336
9428
 
9337
9429
  Qcur = ggml_rope_ext(
9338
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9430
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
9339
9431
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9340
9432
  );
9341
9433
  cb(Qcur, "Qcur", il);
@@ -9346,7 +9438,7 @@ struct llm_build_context {
9346
9438
  cb(Qcur, "Qcur", il);
9347
9439
 
9348
9440
  Kcur = ggml_rope_ext(
9349
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9441
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
9350
9442
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9351
9443
  );
9352
9444
  cb(Kcur, "Kcur", il);
@@ -9457,7 +9549,7 @@ struct llm_build_context {
9457
9549
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9458
9550
 
9459
9551
  Qcur = ggml_rope_ext(
9460
- ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9552
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
9461
9553
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9462
9554
  );
9463
9555
  cb(Qcur, "Qcur", il);
@@ -9466,7 +9558,7 @@ struct llm_build_context {
9466
9558
  cb(Qcur, "Qcur", il);
9467
9559
 
9468
9560
  Kcur = ggml_rope_ext(
9469
- ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9561
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
9470
9562
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9471
9563
  );
9472
9564
  cb(Kcur, "Kcur", il);
@@ -9574,13 +9666,13 @@ struct llm_build_context {
9574
9666
 
9575
9667
  Qcur = ggml_rope_ext(
9576
9668
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
9577
- n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9669
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
9578
9670
  ext_factor, attn_factor, beta_fast, beta_slow);
9579
9671
  cb(Qcur, "Qcur", il);
9580
9672
 
9581
9673
  Kcur = ggml_rope_ext(
9582
9674
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
9583
- n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9675
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
9584
9676
  ext_factor, attn_factor, beta_fast, beta_slow);
9585
9677
  cb(Kcur, "Kcur", il);
9586
9678
 
@@ -9782,14 +9874,14 @@ struct llm_build_context {
9782
9874
 
9783
9875
  struct ggml_tensor * Qcur = ggml_rope_ext(
9784
9876
  ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9785
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9877
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9786
9878
  ext_factor, attn_factor, beta_fast, beta_slow
9787
9879
  );
9788
9880
  cb(Qcur, "Qcur", il);
9789
9881
 
9790
9882
  struct ggml_tensor * Kcur = ggml_rope_ext(
9791
9883
  ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9792
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9884
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9793
9885
  ext_factor, attn_factor, beta_fast, beta_slow
9794
9886
  );
9795
9887
  cb(Kcur, "Kcur", il);
@@ -9898,14 +9990,14 @@ struct llm_build_context {
9898
9990
 
9899
9991
  Qcur = ggml_rope_ext(
9900
9992
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9901
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9993
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9902
9994
  ext_factor, attn_factor, beta_fast, beta_slow
9903
9995
  );
9904
9996
  cb(Qcur, "Qcur", il);
9905
9997
 
9906
9998
  Kcur = ggml_rope_ext(
9907
9999
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9908
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10000
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9909
10001
  ext_factor, attn_factor, beta_fast, beta_slow
9910
10002
  );
9911
10003
  cb(Kcur, "Kcur", il);
@@ -10015,14 +10107,14 @@ struct llm_build_context {
10015
10107
 
10016
10108
  Qcur = ggml_rope_ext(
10017
10109
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10018
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10110
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10019
10111
  ext_factor, attn_factor, beta_fast, beta_slow
10020
10112
  );
10021
10113
  cb(Qcur, "Qcur", il);
10022
10114
 
10023
10115
  Kcur = ggml_rope_ext(
10024
10116
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10025
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10117
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10026
10118
  ext_factor, attn_factor, beta_fast, beta_slow
10027
10119
  );
10028
10120
  cb(Kcur, "Kcur", il);
@@ -10145,14 +10237,14 @@ struct llm_build_context {
10145
10237
 
10146
10238
  Qcur = ggml_rope_ext(
10147
10239
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10148
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10240
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10149
10241
  ext_factor, attn_factor, beta_fast, beta_slow
10150
10242
  );
10151
10243
  cb(Qcur, "Qcur", il);
10152
10244
 
10153
10245
  Kcur = ggml_rope_ext(
10154
10246
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10155
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10247
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10156
10248
  ext_factor, attn_factor, beta_fast, beta_slow
10157
10249
  );
10158
10250
  cb(Kcur, "Kcur", il);
@@ -10217,7 +10309,7 @@ struct llm_build_context {
10217
10309
  cb(cur, "lmhead_scaling", -1);
10218
10310
 
10219
10311
  // lm_head
10220
- cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
10312
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10221
10313
  cb(cur, "result_output", -1);
10222
10314
 
10223
10315
  ggml_build_forward_expand(gf, cur);
@@ -10265,7 +10357,7 @@ struct llm_build_context {
10265
10357
 
10266
10358
  Qcur = ggml_rope_ext(
10267
10359
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
10268
- n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10360
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
10269
10361
  ext_factor, attn_factor, beta_fast, beta_slow);
10270
10362
  cb(Qcur, "Qcur", il);
10271
10363
 
@@ -10274,7 +10366,7 @@ struct llm_build_context {
10274
10366
 
10275
10367
  Kcur = ggml_rope_ext(
10276
10368
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
10277
- n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10369
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
10278
10370
  ext_factor, attn_factor, beta_fast, beta_slow);
10279
10371
  cb(Kcur, "Kcur", il);
10280
10372
 
@@ -10385,14 +10477,14 @@ struct llm_build_context {
10385
10477
 
10386
10478
  Qcur = ggml_rope_ext(
10387
10479
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10388
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10480
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10389
10481
  ext_factor, attn_factor, beta_fast, beta_slow
10390
10482
  );
10391
10483
  cb(Qcur, "Qcur", il);
10392
10484
 
10393
10485
  Kcur = ggml_rope_ext(
10394
10486
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10395
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10487
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10396
10488
  ext_factor, attn_factor, beta_fast, beta_slow
10397
10489
  );
10398
10490
  cb(Kcur, "Kcur", il);
@@ -10675,14 +10767,14 @@ struct llm_build_context {
10675
10767
 
10676
10768
  Qcur = ggml_rope_ext(
10677
10769
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10678
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10770
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10679
10771
  ext_factor, attn_factor, beta_fast, beta_slow
10680
10772
  );
10681
10773
  cb(Qcur, "Qcur", il);
10682
10774
 
10683
10775
  Kcur = ggml_rope_ext(
10684
10776
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10685
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10777
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10686
10778
  ext_factor, attn_factor, beta_fast, beta_slow
10687
10779
  );
10688
10780
  cb(Kcur, "Kcur", il);
@@ -10806,14 +10898,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10920,14 +11012,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -11055,14 +11147,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -11272,7 +11364,7 @@ struct llm_build_context {
                 q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
                 q_pe = ggml_rope_ext(
                     ctx0, q_pe, inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor_scaled, beta_fast, beta_slow
                 );
                 cb(q_pe, "q_pe", il);
@@ -11281,7 +11373,7 @@ struct llm_build_context {
                 k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
                 k_pe = ggml_rope_ext(
                     ctx0, k_pe, inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor_scaled, beta_fast, beta_slow
                 );
                 cb(k_pe, "k_pe", il);
@@ -11458,7 +11550,8 @@ static struct ggml_cgraph * llama_build_graph(
         if (batch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 for (auto * backend : lctx.backends) {
-                    if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
                         ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
                         break;
                     }
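
The scheduling test above is now two-sided: a backend must both host the layer's buffer type and be able to run (or offload) the op before the norm tensor is pinned to it. A minimal standalone sketch of that eligibility check, with stand-in capability flags in place of the real ggml_backend_supports_buft / ggml_backend_supports_op / ggml_backend_offload_op predicates:

#include <cstdio>
#include <vector>

struct backend_caps {
    const char * name;
    bool supports_buft; // can host the layer's buffer type
    bool supports_op;   // can execute this op itself
    bool offload_op;    // accepts the op for offload anyway
};

int main() {
    std::vector<backend_caps> backends = {
        { "blas", true, false, false }, // hosts the buffer but cannot run the op
        { "cuda", true, true,  false }, // hosts the buffer and runs the op -> chosen
    };
    for (const backend_caps & b : backends) {
        // both halves of the new condition must hold before pinning the tensor
        if (b.supports_buft && (b.supports_op || b.offload_op)) {
            std::printf("norm tensor assigned to %s\n", b.name);
            break; // first eligible backend wins
        }
    }
    return 0;
}
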
@@ -11955,6 +12048,11 @@ static void llama_graph_compute(
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
+#ifdef GGML_USE_BLAS
+    if (lctx.backend_blas != nullptr) {
+        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+    }
+#endif
 
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
@@ -12177,17 +12275,6 @@ static int llama_decode_internal(
     }
     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-    //       with the BLAS calls. need a better solution
-    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-    //       being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        n_threads = std::min(4, n_threads);
-    }
-
     ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
     llama_set_inputs(lctx, u_batch);
@@ -12616,27 +12703,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 
 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
 }
 
 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
 }
 
 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
 }
 
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
 }
 
 static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
 }
 
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
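
These predicates switch from comparing a single enum (type == LLAMA_TOKEN_TYPE_*) to testing bits in a mask (attr & LLAMA_TOKEN_ATTR_*), so one token can now carry several attributes at once. A standalone sketch with illustrative flag values only; the real constants live in llama.h:

#include <cstdint>
#include <cstdio>

enum token_attr : uint32_t {
    TOKEN_ATTR_NORMAL       = 1u << 0,
    TOKEN_ATTR_CONTROL      = 1u << 1,
    TOKEN_ATTR_USER_DEFINED = 1u << 2,
    TOKEN_ATTR_RSTRIP       = 1u << 3,
};

int main() {
    // a user-defined special token that also requests right-stripping
    const uint32_t attr = TOKEN_ATTR_USER_DEFINED | TOKEN_ATTR_RSTRIP;

    // an equality test cannot express such combinations; bit tests can
    if (attr & TOKEN_ATTR_USER_DEFINED) std::printf("user-defined\n");
    if (attr & TOKEN_ATTR_RSTRIP)       std::printf("rstrip\n");
    if (!(attr & TOKEN_ATTR_CONTROL))   std::printf("not a control token\n");
    return 0;
}
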
@@ -12954,6 +13041,11 @@ struct llm_tokenizer_bpe {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_PORO:
+                word_collection = unicode_regex_split(text, {
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                });
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
@@ -13254,7 +13346,8 @@ struct fragment_buffer_variant {
 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
     for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
-        const auto & special_token = vocab.id_to_token[special_id].text;
+        const auto & data = vocab.id_to_token[special_id];
+        const auto & special_token = data.text;
 
         // for each text fragment
         std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -13291,13 +13384,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                 if (match > raw_text_base_offset) {
                     // left
                     const int64_t left_reminder_offset = raw_text_base_offset + 0;
-                    const int64_t left_reminder_length = match - raw_text_base_offset;
-                    buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                    int64_t left_reminder_length = match - raw_text_base_offset;
+
+                    if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
+                        while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+                            left_reminder_length--;
+                        }
+                    }
+
+                    if (left_reminder_length > 0) {
+                        buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                        it++;
+                    }
 
 #ifdef PRETOKENIZERDEBUG
                     LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
-                    it++;
                 }
 
                 // special token
@@ -13306,16 +13408,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
                 // right
                 if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
-                    const int64_t right_reminder_offset = match + special_token.length();
-                    const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
-                    buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                    int64_t right_reminder_offset = match + special_token.length();
+                    int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+
+                    if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
+                        while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+                            right_reminder_offset++;
+                            right_reminder_length--;
+                        }
+                    }
+
+                    if (right_reminder_length > 0) {
+                        buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                        it++;
+                    }
 
 #ifdef PRETOKENIZERDEBUG
                     LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
 #endif
 
-                    it++;
-
                     if (source == 0) {
                         buffer.erase_after(buffer.before_begin());
                     } else {
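
The LSTRIP/RSTRIP handling above folds whitespace adjacent to a special token into the match and drops fragments that become empty (the new length > 0 guards). A standalone sketch of the stripping itself, using a hypothetical token "<tok>" flagged with both attributes:

#include <cctype>
#include <cstdio>
#include <string>

int main() {
    const std::string text    = "hello   <tok>   world";
    const std::string special = "<tok>";
    const size_t match        = text.find(special);

    // left fragment: LSTRIP drops the whitespace just before the token
    size_t left_len = match;
    while (left_len > 0 && std::isspace((unsigned char) text[left_len - 1])) {
        left_len--;
    }

    // right fragment: RSTRIP drops the whitespace just after the token
    size_t right_off = match + special.size();
    while (right_off < text.size() && std::isspace((unsigned char) text[right_off])) {
        right_off++;
    }

    std::printf("left:  '%s'\n", text.substr(0, left_len).c_str()); // "hello"
    std::printf("right: '%s'\n", text.substr(right_off).c_str());   // "world"
    return 0;
}
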
@@ -13361,9 +13472,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     // tokenizer.encode('', add_special_tokens=True) returns [1]
     // tokenizer.encode('', add_special_tokens=False) returns []
 
-    static const bool rtrim = true;  //TODO: as param
     bool is_prev_special = false;
-    bool special_token_rtrim = false;
 
     if (add_special && vocab.special_add_bos != 0) {
         GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13373,25 +13482,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 
     for (const auto & fragment : fragment_buffer) {
         if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-            // without adding this leading whitespace, we do not get the same results as the original tokenizer
-
-            // TODO: It's likely possible to get rid of this string copy entirely
-            //  by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
-            //  and passing 'add space prefix' as bool argument
-            //
             auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
-            if (special_token_rtrim) {
-                size_t num_whitespaces = 0;
-                while (isspace(raw_text[num_whitespaces])) {
-                    num_whitespaces++;
-                }
-                if (num_whitespaces == raw_text.size()) {
-                    continue; // skip if all whitespaces
-                }
-                raw_text = raw_text.substr(num_whitespaces);
-            }
-
             if (vocab.add_space_prefix) {
                 if (!output.size() || is_prev_special) {  // prefix with space if first token
                     raw_text = " " + raw_text;
@@ -13407,11 +13499,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
         } else {  // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
             output.push_back(fragment.token);
             is_prev_special = true;
-            // phi-3 special tokens without rtrim, works fine for llama-spm too
-            special_token_rtrim = rtrim
-                && fragment.token != vocab.special_bos_id
-                && fragment.token != vocab.special_unk_id
-                && fragment.token != vocab.special_eos_id;
         }
     }
 
@@ -13574,7 +13661,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
         const uint32_t chr) {
 
     bool found = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
 
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
@@ -13583,6 +13670,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
             // inclusive range, e.g. [a-z]
             found = found || (pos->value <= chr && chr <= pos[1].value);
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
         } else {
             // exact char match, e.g. [a] or "a"
             found = found || pos->value == chr;
@@ -13600,7 +13691,7 @@ static bool llama_grammar_match_partial_char(
         const llama_grammar_element * pos,
         const llama_partial_utf8     partial_utf8) {
 
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
 
     uint32_t partial_value = partial_utf8.value;
@@ -13630,6 +13721,9 @@ static bool llama_grammar_match_partial_char(
                 return is_positive_char;
             }
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            return true;
         } else {
             // exact char match, e.g. [a] or "a"
             if (low <= pos->value && pos->value <= high) {
@@ -13690,6 +13784,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
+        case LLAMA_GRETYPE_CHAR_ANY:
             if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
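
LLAMA_GRETYPE_CHAR_ANY extends the grammar engine with a "." wildcard: a positive character class that accepts every code point and advances by a single element. A reduced standalone sketch of the matching rule, with types simplified from llama_grammar_element:

#include <cstdint>
#include <cstdio>

enum gretype { GRETYPE_CHAR, GRETYPE_CHAR_ANY };
struct gelem { gretype type; uint32_t value; };

// positive single-element match: an exact character, or anything for "."
static bool match_one(const gelem & e, uint32_t chr) {
    switch (e.type) {
        case GRETYPE_CHAR:     return e.value == chr; // exact char, e.g. "a"
        case GRETYPE_CHAR_ANY: return true;           // "." matches any character
    }
    return false;
}

int main() {
    const gelem exact = { GRETYPE_CHAR, 'a' };
    const gelem any   = { GRETYPE_CHAR_ANY, 0 };
    // prints: 1 0 1
    std::printf("%d %d %d\n", match_one(exact, 'a'), match_one(exact, 'b'), match_one(any, 'b'));
    return 0;
}
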
@@ -14646,260 +14741,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
-//
-// Beam search
-//
-
-struct llama_beam {
-    std::vector<llama_token> tokens;
-    float p;  // Cumulative beam probability (renormalized relative to all beams)
-    bool eob; // Initialize end-of-beam to false. Callback sets this to true.
-    // Sort beams by probability. In case of ties, prefer beams at eob.
-    bool operator<(const llama_beam & rhs) const {
-        return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
-    }
-    // Shift off first n tokens and discard them.
-    void shift_tokens(const size_t n) {
-        if (n) {
-            std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
-            tokens.resize(tokens.size() - n);
-        }
-    }
-    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
-};
-
-// A struct for calculating logit-related info.
-struct llama_logit_info {
-    const float * const logits;
-    const int n_vocab;
-    const float max_l;
-    const float normalizer;
-    struct sum_exp {
-        float max_l;
-        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
-    };
-    llama_logit_info(llama_context * ctx)
-      : logits(llama_get_logits(ctx))
-      , n_vocab(llama_n_vocab(llama_get_model(ctx)))
-      , max_l(*std::max_element(logits, logits + n_vocab))
-      , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
-      { }
-    llama_token_data get_token_data(const llama_token token_id) const {
-        constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
-        return {token_id, logits[token_id], p};
-    }
-    // Return top k token_data by logit.
-    std::vector<llama_token_data> top_k(size_t k) {
-        std::vector<llama_token_data> min_heap; // min-heap by logit
-        const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
-        min_heap.reserve(k_min);
-        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
-            min_heap.push_back(get_token_data(token_id));
-        }
-        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
-        std::make_heap(min_heap.begin(), min_heap.end(), comp);
-        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
-            if (min_heap.front().logit < logits[token_id]) {
-                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
-                min_heap.back().id = token_id;
-                min_heap.back().logit = logits[token_id];
-                std::push_heap(min_heap.begin(), min_heap.end(), comp);
-            }
-        }
-        return min_heap;
-    }
-    float probability_from_logit(float logit) const {
-        return normalizer * std::exp(logit - max_l);
-    }
-};
-
-struct llama_beam_search_data {
-    llama_context * ctx;
-    size_t n_beams;
-    int n_past;
-    int n_predict;
-    std::vector<llama_beam> beams;
-    std::vector<llama_beam> next_beams;
-
-    // Re-calculated on each loop iteration
-    size_t common_prefix_length;
-
-    // Used to communicate to/from callback on beams state.
-    std::vector<llama_beam_view> beam_views;
-
-    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
-      : ctx(ctx)
-      , n_beams(n_beams)
-      , n_past(n_past)
-      , n_predict(n_predict)
-      , beam_views(n_beams) {
-        beams.reserve(n_beams);
-        next_beams.reserve(n_beams);
-    }
-
-    // Collapse beams to a single beam given by index.
-    void collapse_beams(const size_t beam_idx) {
-        if (0u < beam_idx) {
-            std::swap(beams[0], beams[beam_idx]);
-        }
-        beams.resize(1);
-    }
-
-    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
-    // The repetitive patterns below reflect the 2 stages of heaps:
-    //  * Gather elements until the vector is full, then call std::make_heap() on it.
-    //  * If the heap is full and a new element is found that should be included, pop the
-    //    least element to the back(), replace it with the new, then push it into the heap.
-    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
-        // Min-heaps use a greater-than comparator.
-        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
-        if (beam.eob) {
-            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
-            if (next_beams.size() < n_beams) {
-                next_beams.push_back(std::move(beam));
-                if (next_beams.size() == n_beams) {
-                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
-                }
-            } else if (next_beams.front().p < beam.p) {
-                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                next_beams.back() = std::move(beam);
-                std::push_heap(next_beams.begin(), next_beams.end(), comp);
-            }
-        } else {
-            // beam is not at end-of-sentence, so branch with next top_k tokens.
-            if (!beam.tokens.empty()) {
-                llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
-            }
-            llama_logit_info logit_info(ctx);
-            std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
-
-            // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
-            // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
-            llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
-            size_t i=0;
-            if (next_beams.size() < n_beams) {
-                for (; next_beams.size() < n_beams ; ++i) {
-                    llama_beam next_beam = beam;
-                    next_beam.tokens.push_back(next_tokens[i].id);
-                    next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
-                    next_beams.push_back(std::move(next_beam));
-                }
-                std::make_heap(next_beams.begin(), next_beams.end(), comp);
-            } else {
-                for (; next_beams.front().p == 0.0f ; ++i) {
-                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                    next_beams.back() = beam;
-                    next_beams.back().tokens.push_back(next_tokens[i].id);
-                    next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
-                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
-                }
-            }
-            for (; i < n_beams ; ++i) {
-                const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
-                if (next_beams.front().p < next_p) {
-                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                    next_beams.back() = beam;
-                    next_beams.back().tokens.push_back(next_tokens[i].id);
-                    next_beams.back().p = next_p;
-                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
-                }
-            }
-        }
-    }
-
-    // Find common_prefix_length based on beams.
-    // Requires beams is not empty.
-    size_t find_common_prefix_length() {
-        size_t common_prefix_length = beams[0].tokens.size();
-        for (size_t i = 1 ; i < beams.size() ; ++i) {
-            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
-            for (size_t j = 0 ; j < common_prefix_length ; ++j) {
-                if (beams[0].tokens[j] != beams[i].tokens[j]) {
-                    common_prefix_length = j;
-                    break;
-                }
-            }
-        }
-        return common_prefix_length;
-    }
-
-    // Construct beams_state to send back to caller via the callback function.
-    // Side effect: set common_prefix_length = find_common_prefix_length();
-    llama_beams_state get_beams_state(const bool last_call) {
-        for (size_t i = 0 ; i < beams.size() ; ++i) {
-            beam_views[i] = beams[i].view();
-        }
-        common_prefix_length = find_common_prefix_length();
-        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
-    }
-
-    // Loop:
-    //  * while i < n_predict, AND
-    //  * any of the beams have not yet reached end-of-beam (eob), AND
-    //  * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
-    //    (since all other beam probabilities can only decrease)
-    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
-        beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
-        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
-        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
-                       !beams[top_beam_index()].eob ; ++i) {
-            callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
-            update_beams_from_beam_views();   // Update values (p,eob) that callback may have changed.
-            if (common_prefix_length) {
-                llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
-                n_past += common_prefix_length;
-            }
-            // Zero-out next_beam probabilities to place them last in following min-heap.
-            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
-            for (llama_beam & beam : beams) {
-                beam.shift_tokens(common_prefix_length);
-                fill_next_beams_by_top_probabilities(beam);
-            }
-            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
-            beams.swap(next_beams);
-            renormalize_beam_probabilities(beams);
-        }
-        collapse_beams(top_beam_index());
-        callback(callback_data, get_beams_state(true));
-    }
-
-    // As beams grow, the cumulative probabilities decrease.
-    // Renormalize them to avoid floating point underflow.
-    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
-        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
-        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
-        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
-    }
-
-    // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
-    size_t top_beam_index() {
-        return std::max_element(beams.begin(), beams.end()) - beams.begin();
-    }
-
-    // Copy (p,eob) for each beam which may have been changed by the callback.
-    void update_beams_from_beam_views() {
-        for (size_t i = 0 ; i < beams.size() ; ++i) {
-            beams[i].p = beam_views[i].p;
-            beams[i].eob = beam_views[i].eob;
-        }
-    }
-};
-
-void llama_beam_search(llama_context * ctx,
-                       llama_beam_search_callback_fn_t callback, void * callback_data,
-                       size_t n_beams, int n_past, int n_predict) {
-    assert(ctx);
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
-
-    beam_search_data.loop(callback, callback_data);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-}
-
 //
 // quantization
 //
@@ -15417,6 +15258,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
             qs.has_imatrix = true;
+            // check imatrix for nans or infs
+            for (const auto & kv : *imatrix_data) {
+                for (float f : kv.second) {
+                    if (!std::isfinite(f)) {
+                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                    }
+                }
+            }
         }
     }
 
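Quantization now fails fast when the importance matrix carries NaN or Inf entries instead of silently producing a corrupt model. The guard reduces to std::isfinite over every stored float; a standalone sketch:

#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
    // stand-in for one imatrix entry's importance values
    const std::vector<float> values = { 1.0f, std::nanf(""), std::numeric_limits<float>::infinity() };
    for (const float f : values) {
        if (!std::isfinite(f)) {
            std::printf("rejecting non-finite importance value: %f\n", f);
        }
    }
    return 0;
}
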
@@ -16110,7 +15959,7 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
     defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
@@ -16167,7 +16016,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr) {
+    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
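
The extra params.rpc_servers[0] != '\0' check means an empty string now yields no RPC endpoints rather than a single empty endpoint. A standalone sketch of the guarded comma-split (comma delimiter as used by llama.cpp):

#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> split_servers(const char * rpc_servers) {
    std::vector<std::string> out;
    if (rpc_servers == nullptr || rpc_servers[0] == '\0') return out; // new guard
    std::string servers(rpc_servers);
    size_t pos = 0;
    while ((pos = servers.find(',')) != std::string::npos) {
        out.push_back(servers.substr(0, pos));
        servers.erase(0, pos + 1);
    }
    out.push_back(servers);
    return out;
}

int main() {
    for (const auto & s : split_servers("host1:50052,host2:50052")) {
        std::printf("endpoint: %s\n", s.c_str());
    }
    std::printf("empty -> %zu endpoints\n", split_servers("").size());
    return 0;
}
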
@@ -16221,6 +16070,11 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }
 
+    if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
+        return nullptr;
+    }
+
     llama_context * ctx = new llama_context(*model);
 
     const auto & hparams = model->hparams;
@@ -16259,8 +16113,8 @@ struct llama_context * llama_new_context_with_model(
 
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
-    cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
-                              hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
+    cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
+                              hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
                               hparams.n_ctx_train;
 
     cparams.cb_eval = params.cb_eval;
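
Beyond the rename, the resolution order for the YaRN original context is unchanged: explicit context parameter first, then the GGUF hyperparameter, then the training context. A standalone sketch of that fallback chain:

#include <cstdint>
#include <cstdio>

static uint32_t resolve_n_ctx_orig_yarn(uint32_t params_yarn_orig_ctx,
                                        uint32_t hparams_n_ctx_orig_yarn,
                                        uint32_t n_ctx_train) {
    return params_yarn_orig_ctx    != 0 ? params_yarn_orig_ctx
         : hparams_n_ctx_orig_yarn != 0 ? hparams_n_ctx_orig_yarn
         : n_ctx_train;
}

int main() {
    std::printf("%u\n", resolve_n_ctx_orig_yarn(0,     0,    4096)); // 4096: training context
    std::printf("%u\n", resolve_n_ctx_orig_yarn(0,     8192, 4096)); // 8192: GGUF value
    std::printf("%u\n", resolve_n_ctx_orig_yarn(32768, 8192, 4096)); // 32768: user override
    return 0;
}
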
@@ -16325,17 +16179,7 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        for (auto & server : model->rpc_servers) {
-            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -16374,7 +16218,7 @@ struct llama_context * llama_new_context_with_model(
             return nullptr;
         }
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            ggml_backend_t backend = ggml_backend_vk_init(0);
+            ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
                 llama_free(ctx);
@@ -16428,6 +16272,29 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #endif
+
+#ifdef GGML_USE_BLAS
+        ctx->backend_blas = ggml_backend_blas_init();
+        if (ctx->backend_blas == nullptr) {
+            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+        } else {
+            ctx->backends.push_back(ctx->backend_blas);
+        }
+#endif
+
+#if defined(GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+#endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
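
Registration order changes here: RPC backends move behind the GPU backends and are only created when layers are actually offloaded, the new BLAS backend slots in before them, and the CPU backend is still appended last. A sketch of the resulting order under assumed build flags; the scheduler's exact priority semantics are not shown:

#include <cstdio>
#include <string>
#include <vector>

int main() {
    const bool use_blas     = true;                         // GGML_USE_BLAS
    const int  n_gpu_layers = 32;                           // model->n_gpu_layers
    const std::vector<std::string> rpc = { "host:50052" };  // model->rpc_servers

    std::vector<std::string> backends;
    backends.push_back("metal");                            // GPU backend(s) first
    if (use_blas) backends.push_back("blas");               // optional BLAS backend
    if (n_gpu_layers > 0) {
        for (const auto & e : rpc) backends.push_back("rpc:" + e);
    }
    backends.push_back("cpu");                              // CPU backend always last

    for (const auto & b : backends) std::printf("%s\n", b.c_str());
    return 0;
}
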
@@ -18209,9 +18076,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
     return model->vocab.id_to_token[token].score;
 }
 
-llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
     GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return model->vocab.id_to_token[token].type;
+    return model->vocab.id_to_token[token].attr;
 }
 
 bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
@@ -18313,9 +18180,14 @@ static std::string llama_decode_text(const std::string & text) {
 
 // does not write null-terminator to buf
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    if (!special && llama_is_control_token(model->vocab, token)) {
+        return 0;
+    }
+
     // if we have a cache - use it
     {
-        const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
+        const auto & cache = model->vocab.cache_token_to_piece;
 
         if (!cache.empty()) {
             const auto & res = cache.at(token);