llama_cpp 0.15.4 → 0.16.1

Files changed (161)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +3 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +17 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +166 -82
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  141. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  142. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
  143. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  144. data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
  145. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
  146. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  147. data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
  148. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  149. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
  150. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
  151. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
  152. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
  153. data/vendor/tmp/llama.cpp/ggml.c +278 -603
  154. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  155. data/vendor/tmp/llama.cpp/llama.cpp +345 -473
  156. data/vendor/tmp/llama.cpp/llama.h +21 -43
  157. metadata +134 -7
  158. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  159. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  160. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  161. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
@@ -13,8 +13,6 @@
 
 #ifdef GGML_USE_CUDA
 # include "ggml-cuda.h"
-#elif defined(GGML_USE_CLBLAST)
-# include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)
 # include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
@@ -23,6 +21,10 @@
 # include "ggml-kompute.h"
 #endif
 
+#ifdef GGML_USE_BLAS
+# include "ggml-blas.h"
+#endif
+
 #ifdef GGML_USE_METAL
 # include "ggml-metal.h"
 #endif
@@ -110,7 +112,7 @@
 //
 
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal (ggml_log_level level, const char* format, ...);
+static void llama_log_internal (ggml_log_level level, const char * format, ...);
 static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
 #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
@@ -706,6 +708,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
 { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
 { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
 { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
 { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
 { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -1850,7 +1853,7 @@ struct llama_hparams {
 float rope_attn_factor = 1.0f;
 float rope_freq_base_train;
 float rope_freq_scale_train;
-uint32_t n_yarn_orig_ctx;
+uint32_t n_ctx_orig_yarn;
 float rope_yarn_log_mul;
 
 // for State Space Models
@@ -1892,7 +1895,7 @@ struct llama_hparams {
 if (this->n_expert_shared != other.n_expert_shared) return true;
 
 if (this->rope_finetuned != other.rope_finetuned) return true;
-if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
+if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
 
 if (this->ssm_d_conv != other.ssm_d_conv) return true;
 if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -1951,7 +1954,7 @@ struct llama_cparams {
 float rope_freq_base;
 float rope_freq_scale;
 
-uint32_t n_yarn_orig_ctx;
+uint32_t n_ctx_orig_yarn;
 // These hyperparameters are not exposed in GGUF, because all
 // existing YaRN models use the same values for them.
 float yarn_ext_factor;
@@ -2149,12 +2152,12 @@ struct llama_control_vector {
 struct llama_vocab {
 using id = int32_t;
 using token = std::string;
-using ttype = llama_token_type;
+using tattr = llama_token_attr;
 
 struct token_data {
 token text;
 float score;
-ttype type;
+tattr attr;
 };
 
 enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -2164,8 +2167,7 @@ struct llama_vocab {
 std::vector<token_data> id_to_token;
 
 std::vector<id> cache_special_tokens;
-std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
-std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
+std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
 
 std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
@@ -2301,9 +2303,13 @@ struct llama_context {
 std::vector<ggml_backend_t> backends;
 #ifdef GGML_USE_METAL
 ggml_backend_t backend_metal = nullptr;
+#endif
+#ifdef GGML_USE_BLAS
+ggml_backend_t backend_blas = nullptr;
 #endif
 ggml_backend_t backend_cpu = nullptr;
 
+
 const llama_model & model;
 
 // key + value cache for the self attention
@@ -2372,13 +2378,34 @@ struct llama_context {
 struct llama_control_vector cvec;
 };
 
+static size_t llama_get_device_count(const llama_model & model) {
+size_t count = 1;
+#if defined(GGML_USE_CUDA)
+count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC)
+count += model.rpc_servers.size();
+#endif
+return count;
+GGML_UNUSED(model);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
 ggml_backend_buffer_type_t buft = nullptr;
 
-#ifdef GGML_USE_RPC
-std::string endpoint = model.rpc_servers[gpu];
-buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+int dev_count = (int)llama_get_device_count(model);
+int rpc_count = (int)model.rpc_servers.size();
+if (gpu >= dev_count - rpc_count) {
+const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+return ggml_backend_rpc_buffer_type(endpoint);
+}
+#endif
+#if defined(GGML_USE_METAL)
 buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
 buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2386,8 +2413,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
 buft = ggml_backend_vk_buffer_type(gpu);
 #elif defined(GGML_USE_SYCL)
 buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-buft = ggml_backend_opencl_buffer_type();
 #elif defined(GGML_USE_KOMPUTE)
 buft = ggml_backend_kompute_buffer_type(gpu);
 if (buft == nullptr) {
@@ -2426,29 +2451,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
 GGML_UNUSED(tensor_split);
 }
 
-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
-return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
-return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-return ggml_backend_vk_get_device_count();
-#else
-return 1;
-#endif
-GGML_UNUSED(model);
-}
-
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
-size_t total;
-size_t free;
-std::string endpoint = model.rpc_servers[device];
-ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
-return free;
-#elif defined(GGML_USE_CUDA)
+int dev_count = (int)llama_get_device_count(model);
+int rpc_count = (int)model.rpc_servers.size();
+if (device >= dev_count - rpc_count) {
+size_t total;
+size_t free;
+const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+return free;
+}
+#endif
+#if defined(GGML_USE_CUDA)
 size_t total;
 size_t free;
 ggml_backend_cuda_get_device_memory(device, &free, &total);
@@ -2520,10 +2535,6 @@ static bool llama_kv_cache_init(
 }
 }
 
-#ifdef GGML_USE_CLBLAST
-offload = false;
-#endif
-
 // count used buffer types
 std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
 if (offload) {
@@ -4003,8 +4014,8 @@ static void llm_load_hparams(
 ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
 hparams.rope_finetuned = rope_finetuned;
 
-hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
-ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
+hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
+ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
 
 // rope_freq_base (optional)
 hparams.rope_freq_base_train = 10000.0f;
@@ -4550,35 +4561,6 @@ static void llm_load_vocab(
 vocab.special_cls_id = -1;
 vocab.special_mask_id = -1;
 
-// For Fill-In-the-Middle (FIM)/infill models which where converted
-// prior to support of FIM special tokens in GGUF, the following
-// will allow those models to continue to work. The general names
-// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
-// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
-// new versions of these models have been published.
-std::string gen_name;
-ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
-std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
-[](unsigned char c){ return std::tolower(c); });
-
-if (gen_name.find("code") != std::string::npos) {
-if (model.arch == LLM_ARCH_LLAMA) {
-vocab.special_prefix_id = 32007;
-vocab.special_suffix_id = 32008;
-vocab.special_middle_id = 32009;
-vocab.special_eot_id = 32010;
-} else if (model.arch == LLM_ARCH_GEMMA) {
-vocab.special_prefix_id = 67;
-vocab.special_suffix_id = 69;
-vocab.special_middle_id = 68;
-// TODO: this is not EOT, it is "file separator" token, needs fix
-// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-//vocab.special_eot_id = 70;
-vocab.special_eot_id = 107;
-}
-}
-
 const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
 if (add_space_prefix_keyidx != -1) {
 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4651,8 +4633,7 @@ static void llm_load_vocab(
 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
 LLAMA_LOG_WARN("%s: \n", __func__);
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-} else if (
-tokenizer_pre == "default") {
+} else if (tokenizer_pre == "default") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 } else if (
 tokenizer_pre == "llama3" ||
@@ -4679,7 +4660,8 @@ static void llm_load_vocab(
 tokenizer_pre == "jina-es" ||
 tokenizer_pre == "jina-de" ||
 tokenizer_pre == "jina-v2-es" ||
-tokenizer_pre == "jina-v2-de") {
+tokenizer_pre == "jina-v2-de" ||
+tokenizer_pre == "jina-v2-code") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
 } else if (
 tokenizer_pre == "refact") {
@@ -4702,6 +4684,9 @@ static void llm_load_vocab(
 } else if (
 tokenizer_pre == "smaug-bpe") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+} else if (
+tokenizer_pre == "poro-chat") {
+vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
 } else {
 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 }
@@ -4740,12 +4725,64 @@ static void llm_load_vocab(
 auto & token_data = vocab.id_to_token[i];
 token_data.text = std::move(word);
 token_data.score = scores ? scores[i] : 0.0f;
-token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
+token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
+
+if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
+switch(toktypes[i]) {
+case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
+case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
+case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
+case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
+case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
+case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+}
+}
 }
 GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
 
 // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
 if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+// For Fill-In-the-Middle (FIM)/infill models which where converted
+// prior to support of FIM special tokens in GGUF, the following
+// will allow those models to continue to work. The general names
+// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+// new versions of these models have been published.
+std::string gen_name;
+ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+[](unsigned char c){ return std::tolower(c); });
+
+if (gen_name.find("code") != std::string::npos) {
+if (model.arch == LLM_ARCH_LLAMA
+&& 32010 < vocab.id_to_token.size()
+&& vocab.id_to_token[32007].text == "<PRE>"
+&& vocab.id_to_token[32008].text == "<SUF>"
+&& vocab.id_to_token[32009].text == "<MID>"
+&& vocab.id_to_token[32010].text == "<EOT>") {
+vocab.special_prefix_id = 32007;
+vocab.special_suffix_id = 32008;
+vocab.special_middle_id = 32009;
+vocab.special_eot_id = 32010;
+} else if (model.arch == LLM_ARCH_GEMMA
+&& 107 < vocab.id_to_token.size()
+&& vocab.id_to_token[67].text == "<|fim_prefix|>"
+&& vocab.id_to_token[69].text == "<|fim_suffix|>"
+&& vocab.id_to_token[68].text == "<|fim_middle|>"
+&& vocab.id_to_token[107].text == "<end_of_turn>") {
+vocab.special_prefix_id = 67;
+vocab.special_suffix_id = 69;
+vocab.special_middle_id = 68;
+// TODO: this is not EOT, it is "file separator" token, needs fix
+// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+//vocab.special_eot_id = 70;
+vocab.special_eot_id = 107;
+}
+}
+
 try {
 vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
 } catch (const std::exception & e) {
@@ -4831,7 +4868,7 @@ static void llm_load_vocab(
 // build special tokens cache
 {
 for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
+if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
 vocab.cache_special_tokens.push_back(id);
 }
 }
@@ -4845,26 +4882,75 @@ static void llm_load_vocab(
 LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
 }
 
-// build token to piece caches
+// build token to piece cache
 {
 size_t size_cache = 0;
 
-std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
-std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
+std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
 
 for (uint32_t id = 0; id < n_vocab; ++id) {
-cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
-cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
+cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
 
 size_cache += cache_token_to_piece[id].size();
-size_cache += cache_token_to_piece_special[id].size();
 }
 
-std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
-std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
 
 LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
 }
+
+// Handle per token attributes
+//NOTE: Each model customizes per token attributes.
+//NOTE: Per token attributes are missing from the GGUF file.
+//TODO: Extract attributes from GGUF file.
+{
+auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
+for (auto substr : substrs) {
+if (str.find(substr) < std::string::npos) {
+return true;
+}
+}
+return false;
+};
+
+auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
+uint32_t current = vocab.id_to_token.at(id).attr;
+current = value ? (current | attr) : (current & ~attr);
+vocab.id_to_token[id].attr = (llama_token_attr) current;
+};
+
+auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
+};
+
+std::string model_name;
+std::string tokenizer_pre;
+
+ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
+ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+
+// model name to lowercase
+std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+[] (const std::string::value_type x) {
+return std::tolower(x);
+}
+);
+
+// set attributes by model/tokenizer name
+if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
+_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+for (auto id : vocab.cache_special_tokens) {
+_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
+}
+for (auto token : {"</s>"}) {
+_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
+}
+for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
+}
+}
+}
 }
 
 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
@@ -4904,7 +4990,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
 LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
 LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
-LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
+LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
 LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
 LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
 LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
@@ -5129,12 +5215,10 @@ static bool llm_load_tensors(
 // output
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-if (model.arch != LLM_ARCH_MINICPM){
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-// if output is NULL, init from the input tok embed
-if (model.output == NULL) {
-model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-}
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+// if output is NULL, init from the input tok embed
+if (model.output == NULL) {
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 }
 }
 
@@ -5453,7 +5537,7 @@ static bool llm_load_tensors(
 
 layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
 } else {
-layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 }
 
 layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5494,6 +5578,9 @@ static bool llm_load_tensors(
 layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
 layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
 
+layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 
@@ -7072,7 +7159,7 @@ struct llm_build_context {
 const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
 const int32_t n_outputs;
 const int32_t kv_head; // index of where we store new KV data in the cache
-const int32_t n_orig_ctx;
+const int32_t n_ctx_orig;
 
 const bool flash_attn;
 
@@ -7121,7 +7208,7 @@ struct llm_build_context {
 n_kv (worst_case ? kv_self.size : kv_self.n),
 n_outputs (worst_case ? n_tokens : lctx.n_outputs),
 kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
-n_orig_ctx (cparams.n_yarn_orig_ctx),
+n_ctx_orig (cparams.n_ctx_orig_yarn),
 flash_attn (cparams.flash_attn),
 pooling_type (cparams.pooling_type),
 rope_type (hparams.rope_type),
@@ -7179,7 +7266,7 @@ struct llm_build_context {
 ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
 ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
 0),
-lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow);
 
 cb(tmp, "K_shifted", il);
@@ -7288,7 +7375,7 @@ struct llm_build_context {
 // choose long/short freq factors based on the context size
 const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
 
-if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
+if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
 return model.layers[il].rope_long;
 }
 
@@ -7404,14 +7491,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -7535,12 +7622,12 @@ struct llm_build_context {
 case MODEL_7B:
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 break;
@@ -7647,14 +7734,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -7767,13 +7854,13 @@ struct llm_build_context {
 
 // using mode = 2 for neox mode
 Qcur = ggml_rope_ext(
-ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
-ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -7891,14 +7978,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -8044,14 +8131,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -8398,14 +8485,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -8457,6 +8544,11 @@ struct llm_build_context {
 // attention layer norm
 cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
 
+if (model.layers[il].attn_norm_2 != nullptr) {
+cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+}
+
 struct ggml_tensor * ffn_inp = cur;
 cb(ffn_inp, "ffn_inp", il);
 
@@ -8838,14 +8930,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, Qcur, inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, Kcur, inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -8957,13 +9049,13 @@ struct llm_build_context {
 
 // using mode = 2 for neox mode
 Qcur = ggml_rope_ext(
-ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
-ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -9069,14 +9161,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -9183,14 +9275,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -9335,7 +9427,7 @@ struct llm_build_context {
 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
 Qcur = ggml_rope_ext(
-ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
@@ -9346,7 +9438,7 @@ struct llm_build_context {
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
-ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -9457,7 +9549,7 @@ struct llm_build_context {
 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
 Qcur = ggml_rope_ext(
-ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
+ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
@@ -9466,7 +9558,7 @@ struct llm_build_context {
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
-ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
+ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -9574,13 +9666,13 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
-n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow);
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
-n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow);
 cb(Kcur, "Kcur", il);
 
@@ -9782,14 +9874,14 @@ struct llm_build_context {
 
 struct ggml_tensor * Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 struct ggml_tensor * Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -9898,14 +9990,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -10015,14 +10107,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -10145,14 +10237,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -10217,7 +10309,7 @@ struct llm_build_context {
 cb(cur, "lmhead_scaling", -1);
 
 // lm_head
-cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+cur = ggml_mul_mat(ctx0, model.output, cur);
 cb(cur, "result_output", -1);
 
 ggml_build_forward_expand(gf, cur);
@@ -10265,7 +10357,7 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
-n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow);
 cb(Qcur, "Qcur", il);
 
@@ -10274,7 +10366,7 @@ struct llm_build_context {
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
-n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow);
 cb(Kcur, "Kcur", il);
 
@@ -10385,14 +10477,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -10675,14 +10767,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -10806,14 +10898,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -10920,14 +11012,14 @@ struct llm_build_context {
10920
11012
 
10921
11013
  Qcur = ggml_rope_ext(
10922
11014
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10923
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11015
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10924
11016
  ext_factor, attn_factor, beta_fast, beta_slow
10925
11017
  );
10926
11018
  cb(Qcur, "Qcur", il);
10927
11019
 
10928
11020
  Kcur = ggml_rope_ext(
10929
11021
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10930
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11022
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10931
11023
  ext_factor, attn_factor, beta_fast, beta_slow
10932
11024
  );
10933
11025
  cb(Kcur, "Kcur", il);
@@ -11055,14 +11147,14 @@ struct llm_build_context {
11055
11147
 
11056
11148
  Qcur = ggml_rope_ext(
11057
11149
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
11058
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11150
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11059
11151
  ext_factor, attn_factor, beta_fast, beta_slow
11060
11152
  );
11061
11153
  cb(Qcur, "Qcur", il);
11062
11154
 
11063
11155
  Kcur = ggml_rope_ext(
11064
11156
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
11065
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11157
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11066
11158
  ext_factor, attn_factor, beta_fast, beta_slow
11067
11159
  );
11068
11160
  cb(Kcur, "Kcur", il);
@@ -11272,7 +11364,7 @@ struct llm_build_context {
  q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
  q_pe = ggml_rope_ext(
  ctx0, q_pe, inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor_scaled, beta_fast, beta_slow
  );
  cb(q_pe, "q_pe", il);
@@ -11281,7 +11373,7 @@ struct llm_build_context {
  k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
  k_pe = ggml_rope_ext(
  ctx0, k_pe, inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor_scaled, beta_fast, beta_slow
  );
  cb(k_pe, "k_pe", il);
@@ -11458,7 +11550,8 @@ static struct ggml_cgraph * llama_build_graph(
  if (batch.n_tokens < 32 || full_offload) {
  if (il != -1 && strcmp(name, "norm") == 0) {
  for (auto * backend : lctx.backends) {
- if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+ if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+ (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
  ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
  break;
  }
@@ -11955,6 +12048,11 @@ static void llama_graph_compute(
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
  ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
  }
+ #ifdef GGML_USE_BLAS
+ if (lctx.backend_blas != nullptr) {
+ ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+ }
+ #endif
 
  ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
@@ -12177,17 +12275,6 @@ static int llama_decode_internal(
  }
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
- // for big prompts, if BLAS is enabled, it is better to use only one thread
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
- // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
- // with the BLAS calls. need a better solution
- // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
- // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
- if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
- n_threads = std::min(4, n_threads);
- }
-
  ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
  llama_set_inputs(lctx, u_batch);
@@ -12616,27 +12703,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 
  static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
  }
 
  static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
  }
 
  static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
  }
 
  static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
  }
 
  static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
  }
 
  static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
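// Illustrative sketch, not part of the upstream patch: the hunk above replaces equality checks
// against a single token "type" enum with bitwise tests on an "attr" bitmask, so one token can
// carry several attributes at once (e.g. CONTROL combined with RSTRIP). A self-contained sketch
// of that pattern using hypothetical, simplified flag names:
#include <cstdint>
#include <cstdio>

enum token_attr_sketch : uint32_t {
    ATTR_SKETCH_NORMAL  = 1u << 0,
    ATTR_SKETCH_CONTROL = 1u << 1,
    ATTR_SKETCH_RSTRIP  = 1u << 2,
};

static bool has_attr(uint32_t attr, token_attr_sketch flag) {
    return (attr & flag) != 0; // bit test instead of an == comparison on an exclusive enum
}

int main() {
    const uint32_t attr = ATTR_SKETCH_CONTROL | ATTR_SKETCH_RSTRIP; // combined attributes
    std::printf("control=%d rstrip=%d normal=%d\n",
                has_attr(attr, ATTR_SKETCH_CONTROL),
                has_attr(attr, ATTR_SKETCH_RSTRIP),
                has_attr(attr, ATTR_SKETCH_NORMAL));
    return 0;
}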
@@ -12954,6 +13041,11 @@ struct llm_tokenizer_bpe {
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
  });
  break;
+ case LLAMA_VOCAB_PRE_TYPE_PORO:
+ word_collection = unicode_regex_split(text, {
+ " ?[^(\\s|.,!?…。,、।۔،)]+",
+ });
+ break;
  default:
  // default regex for BPE tokenization pre-processing
  word_collection = unicode_regex_split(text, {
@@ -13254,7 +13346,8 @@ struct fragment_buffer_variant {
  static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
  // for each special token
  for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
- const auto & special_token = vocab.id_to_token[special_id].text;
+ const auto & data = vocab.id_to_token[special_id];
+ const auto & special_token = data.text;
 
  // for each text fragment
  std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -13291,13 +13384,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
  if (match > raw_text_base_offset) {
  // left
  const int64_t left_reminder_offset = raw_text_base_offset + 0;
- const int64_t left_reminder_length = match - raw_text_base_offset;
- buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+ int64_t left_reminder_length = match - raw_text_base_offset;
+
+ if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
+ while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+ left_reminder_length--;
+ }
+ }
+
+ if (left_reminder_length > 0) {
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+ it++;
+ }
 
  #ifdef PRETOKENIZERDEBUG
  LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
  #endif
- it++;
  }
 
  // special token
@@ -13306,16 +13408,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
  // right
  if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
- const int64_t right_reminder_offset = match + special_token.length();
- const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
- buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+ int64_t right_reminder_offset = match + special_token.length();
+ int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+
+ if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
+ while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+ right_reminder_offset++;
+ right_reminder_length--;
+ }
+ }
+
+ if (right_reminder_length > 0) {
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+ it++;
+ }
 
  #ifdef PRETOKENIZERDEBUG
  LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
  #endif
 
- it++;
-
  if (source == 0) {
  buffer.erase_after(buffer.before_begin());
  } else {
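// Illustrative sketch, not part of the upstream patch: the two hunks above make the special-token
// splitter honor LSTRIP/RSTRIP token attributes by trimming whitespace from the text fragment to
// the left (lstrip) or right (rstrip) of the matched special token, and by re-inserting a fragment
// only when something non-empty remains. A standalone sketch of that trimming, with hypothetical names:
#include <cctype>
#include <cstdio>
#include <string>

// LSTRIP on the special token: drop trailing whitespace of the fragment that precedes it
static std::string trim_left_fragment(std::string s) {
    while (!s.empty() && std::isspace((unsigned char) s.back())) {
        s.pop_back();
    }
    return s;
}

// RSTRIP on the special token: drop leading whitespace of the fragment that follows it
static std::string trim_right_fragment(const std::string & s) {
    size_t i = 0;
    while (i < s.size() && std::isspace((unsigned char) s[i])) {
        i++;
    }
    return s.substr(i);
}

int main() {
    std::printf("[%s]\n", trim_left_fragment("hello   ").c_str());  // prints [hello]
    std::printf("[%s]\n", trim_right_fragment("   world").c_str()); // prints [world]
    return 0;
}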
@@ -13361,9 +13472,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  // tokenizer.encode('', add_special_tokens=True) returns [1]
  // tokenizer.encode('', add_special_tokens=False) returns []
 
- static const bool rtrim = true; //TODO: as param
  bool is_prev_special = false;
- bool special_token_rtrim = false;
 
  if (add_special && vocab.special_add_bos != 0) {
  GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13373,25 +13482,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 
  for (const auto & fragment : fragment_buffer) {
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- // without adding this leading whitespace, we do not get the same results as the original tokenizer
-
- // TODO: It's likely possible to get rid of this string copy entirely
- // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
- // and passing 'add space prefix' as bool argument
- //
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
- if (special_token_rtrim) {
- size_t num_whitespaces = 0;
- while (isspace(raw_text[num_whitespaces])) {
- num_whitespaces++;
- }
- if (num_whitespaces == raw_text.size()) {
- continue; // skip if all whitespaces
- }
- raw_text = raw_text.substr(num_whitespaces);
- }
-
  if (vocab.add_space_prefix) {
  if (!output.size() || is_prev_special) { // prefix with space if first token
  raw_text = " " + raw_text;
@@ -13407,11 +13499,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  output.push_back(fragment.token);
  is_prev_special = true;
- // phi-3 special tokens without rtrim, works fine for llama-spm too
- special_token_rtrim = rtrim
- && fragment.token != vocab.special_bos_id
- && fragment.token != vocab.special_unk_id
- && fragment.token != vocab.special_eos_id;
  }
  }
 
@@ -13574,7 +13661,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
  const uint32_t chr) {
 
  bool found = false;
- bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
 
  GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
@@ -13583,6 +13670,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
  // inclusive range, e.g. [a-z]
  found = found || (pos->value <= chr && chr <= pos[1].value);
  pos += 2;
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+ // Any character matches "."
+ found = true;
+ pos += 1;
  } else {
  // exact char match, e.g. [a] or "a"
  found = found || pos->value == chr;
@@ -13600,7 +13691,7 @@ static bool llama_grammar_match_partial_char(
  const llama_grammar_element * pos,
  const llama_partial_utf8 partial_utf8) {
 
- bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
  GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
 
  uint32_t partial_value = partial_utf8.value;
@@ -13630,6 +13721,9 @@ static bool llama_grammar_match_partial_char(
  return is_positive_char;
  }
  pos += 2;
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+ // Any character matches "."
+ return true;
  } else {
  // exact char match, e.g. [a] or "a"
  if (low <= pos->value && pos->value <= high) {
@@ -13690,6 +13784,7 @@ static void llama_grammar_advance_stack(
  }
  case LLAMA_GRETYPE_CHAR:
  case LLAMA_GRETYPE_CHAR_NOT:
+ case LLAMA_GRETYPE_CHAR_ANY:
  if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
  // only add the stack if it's not a duplicate of one we already have
  new_stacks.emplace_back(stack);
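// Illustrative sketch, not part of the upstream patch: the grammar hunks above introduce a
// LLAMA_GRETYPE_CHAR_ANY element so that "." in a grammar matches any character, in both full and
// partial (UTF-8 prefix) matching, and is treated like the other char elements when advancing
// stacks. A simplified, self-contained sketch of the matching rule with hypothetical names:
#include <cstdint>
#include <cstdio>

enum gretype_sketch { CHAR_EXACT, CHAR_NOT, CHAR_ANY };

struct element_sketch {
    gretype_sketch type;
    uint32_t       value; // code point for CHAR_EXACT / CHAR_NOT, unused for CHAR_ANY
};

static bool matches(const element_sketch & e, uint32_t chr) {
    switch (e.type) {
        case CHAR_ANY:   return true;           // "." matches every character
        case CHAR_EXACT: return chr == e.value; // positive match on one code point
        case CHAR_NOT:   return chr != e.value; // negated match (simplified to one code point)
    }
    return false;
}

int main() {
    const element_sketch any = {CHAR_ANY, 0};
    std::printf(". vs 'x'  -> %d\n", matches(any, 'x'));
    std::printf(". vs '\\n' -> %d\n", matches(any, '\n')); // nothing is excluded in this sketch
    return 0;
}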
@@ -14646,260 +14741,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }
 
- //
- // Beam search
- //
-
- struct llama_beam {
- std::vector<llama_token> tokens;
- float p; // Cumulative beam probability (renormalized relative to all beams)
- bool eob; // Initialize end-of-beam to false. Callback sets this to true.
- // Sort beams by probability. In case of ties, prefer beams at eob.
- bool operator<(const llama_beam & rhs) const {
- return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
- }
- // Shift off first n tokens and discard them.
- void shift_tokens(const size_t n) {
- if (n) {
- std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
- tokens.resize(tokens.size() - n);
- }
- }
- llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
- };
-
- // A struct for calculating logit-related info.
- struct llama_logit_info {
- const float * const logits;
- const int n_vocab;
- const float max_l;
- const float normalizer;
- struct sum_exp {
- float max_l;
- float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
- };
- llama_logit_info(llama_context * ctx)
- : logits(llama_get_logits(ctx))
- , n_vocab(llama_n_vocab(llama_get_model(ctx)))
- , max_l(*std::max_element(logits, logits + n_vocab))
- , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
- { }
- llama_token_data get_token_data(const llama_token token_id) const {
- constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
- return {token_id, logits[token_id], p};
- }
- // Return top k token_data by logit.
- std::vector<llama_token_data> top_k(size_t k) {
- std::vector<llama_token_data> min_heap; // min-heap by logit
- const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
- min_heap.reserve(k_min);
- for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
- min_heap.push_back(get_token_data(token_id));
- }
- auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
- std::make_heap(min_heap.begin(), min_heap.end(), comp);
- for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
- if (min_heap.front().logit < logits[token_id]) {
- std::pop_heap(min_heap.begin(), min_heap.end(), comp);
- min_heap.back().id = token_id;
- min_heap.back().logit = logits[token_id];
- std::push_heap(min_heap.begin(), min_heap.end(), comp);
- }
- }
- return min_heap;
- }
- float probability_from_logit(float logit) const {
- return normalizer * std::exp(logit - max_l);
- }
- };
-
- struct llama_beam_search_data {
- llama_context * ctx;
- size_t n_beams;
- int n_past;
- int n_predict;
- std::vector<llama_beam> beams;
- std::vector<llama_beam> next_beams;
-
- // Re-calculated on each loop iteration
- size_t common_prefix_length;
-
- // Used to communicate to/from callback on beams state.
- std::vector<llama_beam_view> beam_views;
-
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
- : ctx(ctx)
- , n_beams(n_beams)
- , n_past(n_past)
- , n_predict(n_predict)
- , beam_views(n_beams) {
- beams.reserve(n_beams);
- next_beams.reserve(n_beams);
- }
-
- // Collapse beams to a single beam given by index.
- void collapse_beams(const size_t beam_idx) {
- if (0u < beam_idx) {
- std::swap(beams[0], beams[beam_idx]);
- }
- beams.resize(1);
- }
-
- // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
- // The repetitive patterns below reflect the 2 stages of heaps:
- // * Gather elements until the vector is full, then call std::make_heap() on it.
- // * If the heap is full and a new element is found that should be included, pop the
- // least element to the back(), replace it with the new, then push it into the heap.
- void fill_next_beams_by_top_probabilities(llama_beam & beam) {
- // Min-heaps use a greater-than comparator.
- const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
- if (beam.eob) {
- // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
- if (next_beams.size() < n_beams) {
- next_beams.push_back(std::move(beam));
- if (next_beams.size() == n_beams) {
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
- }
- } else if (next_beams.front().p < beam.p) {
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
- next_beams.back() = std::move(beam);
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- } else {
- // beam is not at end-of-sentence, so branch with next top_k tokens.
- if (!beam.tokens.empty()) {
- llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
- }
- llama_logit_info logit_info(ctx);
- std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
-
- // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
- // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
- size_t i=0;
- if (next_beams.size() < n_beams) {
- for (; next_beams.size() < n_beams ; ++i) {
- llama_beam next_beam = beam;
- next_beam.tokens.push_back(next_tokens[i].id);
- next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
- next_beams.push_back(std::move(next_beam));
- }
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
- } else {
- for (; next_beams.front().p == 0.0f ; ++i) {
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
- next_beams.back() = beam;
- next_beams.back().tokens.push_back(next_tokens[i].id);
- next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- }
- for (; i < n_beams ; ++i) {
- const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
- if (next_beams.front().p < next_p) {
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
- next_beams.back() = beam;
- next_beams.back().tokens.push_back(next_tokens[i].id);
- next_beams.back().p = next_p;
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- }
- }
- }
-
- // Find common_prefix_length based on beams.
- // Requires beams is not empty.
- size_t find_common_prefix_length() {
- size_t common_prefix_length = beams[0].tokens.size();
- for (size_t i = 1 ; i < beams.size() ; ++i) {
- common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
- for (size_t j = 0 ; j < common_prefix_length ; ++j) {
- if (beams[0].tokens[j] != beams[i].tokens[j]) {
- common_prefix_length = j;
- break;
- }
- }
- }
- return common_prefix_length;
- }
-
- // Construct beams_state to send back to caller via the callback function.
- // Side effect: set common_prefix_length = find_common_prefix_length();
- llama_beams_state get_beams_state(const bool last_call) {
- for (size_t i = 0 ; i < beams.size() ; ++i) {
- beam_views[i] = beams[i].view();
- }
- common_prefix_length = find_common_prefix_length();
- return {beam_views.data(), beams.size(), common_prefix_length, last_call};
- }
-
- // Loop:
- // * while i < n_predict, AND
- // * any of the beams have not yet reached end-of-beam (eob), AND
- // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
- // (since all other beam probabilities can only decrease)
- void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
- beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
- const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
- for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
- !beams[top_beam_index()].eob ; ++i) {
- callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
- update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
- if (common_prefix_length) {
- llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
- n_past += common_prefix_length;
- }
- // Zero-out next_beam probabilities to place them last in following min-heap.
- std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
- for (llama_beam & beam : beams) {
- beam.shift_tokens(common_prefix_length);
- fill_next_beams_by_top_probabilities(beam);
- }
- // next_beams become the beams of next/final iteration. Swap them to re-use memory.
- beams.swap(next_beams);
- renormalize_beam_probabilities(beams);
- }
- collapse_beams(top_beam_index());
- callback(callback_data, get_beams_state(true));
- }
-
- // As beams grow, the cumulative probabilities decrease.
- // Renormalize them to avoid floating point underflow.
- static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
- const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
- const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
- std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
- }
-
- // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
- size_t top_beam_index() {
- return std::max_element(beams.begin(), beams.end()) - beams.begin();
- }
-
- // Copy (p,eob) for each beam which may have been changed by the callback.
- void update_beams_from_beam_views() {
- for (size_t i = 0 ; i < beams.size() ; ++i) {
- beams[i].p = beam_views[i].p;
- beams[i].eob = beam_views[i].eob;
- }
- }
- };
-
- void llama_beam_search(llama_context * ctx,
- llama_beam_search_callback_fn_t callback, void * callback_data,
- size_t n_beams, int n_past, int n_predict) {
- assert(ctx);
- const int64_t t_start_sample_us = ggml_time_us();
-
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
-
- beam_search_data.loop(callback, callback_data);
-
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- ctx->n_sample++;
- }
-
  //
  // quantization
  //
@@ -15417,6 +15258,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (imatrix_data) {
  LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
  qs.has_imatrix = true;
+ // check imatrix for nans or infs
+ for (const auto & kv : *imatrix_data) {
+ for (float f : kv.second) {
+ if (!std::isfinite(f)) {
+ throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+ }
+ }
+ }
  }
  }
 
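// Illustrative sketch, not part of the upstream patch: the hunk above rejects an importance matrix
// that contains NaN or infinite entries before quantization starts. A standalone sketch of the same
// validation over a map of float vectors:
#include <cmath>
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

static void check_finite(const std::map<std::string, std::vector<float>> & imatrix) {
    for (const auto & kv : imatrix) {
        for (float f : kv.second) {
            if (!std::isfinite(f)) { // catches NaN, +inf and -inf
                throw std::runtime_error("imatrix entry '" + kv.first + "' contains a non-finite value");
            }
        }
    }
}

int main() {
    const std::map<std::string, std::vector<float>> data = {
        {"blk.0.attn_q.weight", {0.1f, 2.0f, 0.5f}}, // hypothetical entry name
    };
    check_finite(data); // would throw if any value were NAN or INFINITY
    std::printf("imatrix ok\n");
    return 0;
}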
@@ -16110,7 +15959,7 @@ bool llama_supports_mlock(void) {
  }
 
  bool llama_supports_gpu_offload(void) {
- #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+ #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
  defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  return true;
@@ -16167,7 +16016,7 @@ struct llama_model * llama_load_model_from_file(
  return true;
  };
  }
- if (params.rpc_servers != nullptr) {
+ if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
  // split the servers set them into model->rpc_servers
  std::string servers(params.rpc_servers);
  size_t pos = 0;
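// Illustrative sketch, not part of the upstream patch: the hunk above additionally skips RPC setup
// when params.rpc_servers points at an empty string rather than only when it is null; the unchanged
// code that follows splits the comma-separated endpoint list. A standalone sketch of that split,
// with a hypothetical endpoint list:
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> split_servers(const char * rpc_servers) {
    std::vector<std::string> out;
    if (rpc_servers == nullptr || rpc_servers[0] == '\0') {
        return out; // same guard as above: null or empty means "no RPC backends"
    }
    std::string servers(rpc_servers);
    size_t pos = 0;
    size_t next;
    while ((next = servers.find(',', pos)) != std::string::npos) {
        out.push_back(servers.substr(pos, next - pos));
        pos = next + 1;
    }
    out.push_back(servers.substr(pos));
    return out;
}

int main() {
    for (const auto & s : split_servers("192.168.0.2:50052,192.168.0.3:50052")) {
        std::printf("endpoint: %s\n", s.c_str());
    }
    return 0;
}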
@@ -16221,6 +16070,11 @@ struct llama_context * llama_new_context_with_model(
  params.flash_attn = false;
  }
 
+ if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+ LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
+ return nullptr;
+ }
+
  llama_context * ctx = new llama_context(*model);
 
  const auto & hparams = model->hparams;
@@ -16259,8 +16113,8 @@ struct llama_context * llama_new_context_with_model(
 
  cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
- cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
- hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
+ cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
+ hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
  hparams.n_ctx_train;
 
  cparams.cb_eval = params.cb_eval;
@@ -16325,17 +16179,7 @@ struct llama_context * llama_new_context_with_model(
 
  if (!hparams.vocab_only) {
  // initialize backends
- #if defined(GGML_USE_RPC)
- for (auto & server : model->rpc_servers) {
- ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
- #elif defined(GGML_USE_METAL)
+ #if defined(GGML_USE_METAL)
  if (model->n_gpu_layers > 0) {
  ctx->backend_metal = ggml_backend_metal_init();
  if (ctx->backend_metal == nullptr) {
@@ -16374,7 +16218,7 @@ struct llama_context * llama_new_context_with_model(
  return nullptr;
  }
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
- ggml_backend_t backend = ggml_backend_vk_init(0);
+ ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
  if (backend == nullptr) {
  LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
  llama_free(ctx);
@@ -16428,6 +16272,29 @@ struct llama_context * llama_new_context_with_model(
  ctx->backends.push_back(backend);
  }
  #endif
+
+ #ifdef GGML_USE_BLAS
+ ctx->backend_blas = ggml_backend_blas_init();
+ if (ctx->backend_blas == nullptr) {
+ LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+ } else {
+ ctx->backends.push_back(ctx->backend_blas);
+ }
+ #endif
+
+ #if defined(GGML_USE_RPC)
+ if (model->n_gpu_layers > 0) {
+ for (const auto & endpoint : model->rpc_servers) {
+ ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+ }
+ #endif
  ctx->backend_cpu = ggml_backend_cpu_init();
  if (ctx->backend_cpu == nullptr) {
  LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
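// Illustrative sketch, not part of the upstream patch: after this hunk the context registers
// backends in the order GPU backend(s), then the optional BLAS backend, then RPC endpoints (only
// when model->n_gpu_layers > 0), and finally the CPU backend, and a failed BLAS init only logs a
// warning instead of aborting context creation. A standard-library-only sketch of that
// "optional backend, keep going on failure" registration pattern, all names hypothetical:
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

struct backend_sketch { std::string name; };

static bool try_register(std::vector<backend_sketch> & backends,
                         const std::string & name,
                         const std::function<bool()> & init,
                         bool required) {
    if (init()) {
        backends.push_back({name});
        return true;
    }
    std::printf("%s: failed to initialize %s backend\n", required ? "error" : "warn", name.c_str());
    return !required; // optional backends (e.g. BLAS here) do not abort setup
}

int main() {
    std::vector<backend_sketch> backends;
    bool ok = true;
    ok = ok && try_register(backends, "gpu",  [] { return true;  }, /*required =*/ true);
    ok = ok && try_register(backends, "blas", [] { return false; }, /*required =*/ false); // warn only
    ok = ok && try_register(backends, "cpu",  [] { return true;  }, /*required =*/ true);  // always last
    std::printf("registered %zu backend(s), ok = %d\n", backends.size(), (int) ok);
    return ok ? 0 : 1;
}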
@@ -18209,9 +18076,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
  return model->vocab.id_to_token[token].score;
  }
 
- llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+ llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
  GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return model->vocab.id_to_token[token].type;
+ return model->vocab.id_to_token[token].attr;
  }
 
  bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
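// Illustrative sketch, not part of the upstream patch: the public accessor is renamed from
// llama_token_get_type() to llama_token_get_attr() and now returns a bitmask, so callers test bits
// instead of comparing a single enum value. A minimal caller-side sketch, assuming the llama.h
// bundled in this vendored tree exposes llama_token_attr and the LLAMA_TOKEN_ATTR_* flags:
#include "llama.h"

// true when the token carries the CONTROL attribute, possibly among others
static bool is_control_token_sketch(const struct llama_model * model, llama_token token) {
    return (llama_token_get_attr(model, token) & LLAMA_TOKEN_ATTR_CONTROL) != 0;
}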
@@ -18313,9 +18180,14 @@ static std::string llama_decode_text(const std::string & text) {
 
  // does not write null-terminator to buf
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+ // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+ if (!special && llama_is_control_token(model->vocab, token)) {
+ return 0;
+ }
+
  // if we have a cache - use it
  {
- const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
+ const auto & cache = model->vocab.cache_token_to_piece;
 
  if (!cache.empty()) {
  const auto & res = cache.at(token);