llama_cpp 0.15.4 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +15 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +13 -1
  7. data/vendor/tmp/llama.cpp/Makefile +62 -35
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
  131. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
  132. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  133. data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
  134. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  135. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
  136. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
  137. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  138. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
  139. data/vendor/tmp/llama.cpp/ggml.c +178 -330
  140. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  141. data/vendor/tmp/llama.cpp/llama.cpp +242 -426
  142. data/vendor/tmp/llama.cpp/llama.h +17 -43
  143. metadata +121 -6
  144. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  145. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  146. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  147. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
@@ -13,8 +13,6 @@
13
13
 
14
14
  #ifdef GGML_USE_CUDA
15
15
  # include "ggml-cuda.h"
16
- #elif defined(GGML_USE_CLBLAST)
17
- # include "ggml-opencl.h"
18
16
  #elif defined(GGML_USE_VULKAN)
19
17
  # include "ggml-vulkan.h"
20
18
  #elif defined(GGML_USE_SYCL)
@@ -110,7 +108,7 @@
110
108
  //
111
109
 
112
110
  LLAMA_ATTRIBUTE_FORMAT(2, 3)
113
- static void llama_log_internal (ggml_log_level level, const char* format, ...);
111
+ static void llama_log_internal (ggml_log_level level, const char * format, ...);
114
112
  static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
115
113
 
116
114
  #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
@@ -1850,7 +1848,7 @@ struct llama_hparams {
1850
1848
  float rope_attn_factor = 1.0f;
1851
1849
  float rope_freq_base_train;
1852
1850
  float rope_freq_scale_train;
1853
- uint32_t n_yarn_orig_ctx;
1851
+ uint32_t n_ctx_orig_yarn;
1854
1852
  float rope_yarn_log_mul;
1855
1853
 
1856
1854
  // for State Space Models
@@ -1892,7 +1890,7 @@ struct llama_hparams {
1892
1890
  if (this->n_expert_shared != other.n_expert_shared) return true;
1893
1891
 
1894
1892
  if (this->rope_finetuned != other.rope_finetuned) return true;
1895
- if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
1893
+ if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
1896
1894
 
1897
1895
  if (this->ssm_d_conv != other.ssm_d_conv) return true;
1898
1896
  if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -1951,7 +1949,7 @@ struct llama_cparams {
1951
1949
  float rope_freq_base;
1952
1950
  float rope_freq_scale;
1953
1951
 
1954
- uint32_t n_yarn_orig_ctx;
1952
+ uint32_t n_ctx_orig_yarn;
1955
1953
  // These hyperparameters are not exposed in GGUF, because all
1956
1954
  // existing YaRN models use the same values for them.
1957
1955
  float yarn_ext_factor;
@@ -2149,12 +2147,12 @@ struct llama_control_vector {
2149
2147
  struct llama_vocab {
2150
2148
  using id = int32_t;
2151
2149
  using token = std::string;
2152
- using ttype = llama_token_type;
2150
+ using tattr = llama_token_attr;
2153
2151
 
2154
2152
  struct token_data {
2155
2153
  token text;
2156
2154
  float score;
2157
- ttype type;
2155
+ tattr attr;
2158
2156
  };
2159
2157
 
2160
2158
  enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -2164,8 +2162,7 @@ struct llama_vocab {
2164
2162
  std::vector<token_data> id_to_token;
2165
2163
 
2166
2164
  std::vector<id> cache_special_tokens;
2167
- std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
2168
- std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
2165
+ std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
2169
2166
 
2170
2167
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;
2171
2168
 
@@ -2372,13 +2369,34 @@ struct llama_context {
2372
2369
  struct llama_control_vector cvec;
2373
2370
  };
2374
2371
 
2372
+ static size_t llama_get_device_count(const llama_model & model) {
2373
+ size_t count = 1;
2374
+ #if defined(GGML_USE_CUDA)
2375
+ count = ggml_backend_cuda_get_device_count();
2376
+ #elif defined(GGML_USE_SYCL)
2377
+ count = ggml_backend_sycl_get_device_count();
2378
+ #elif defined(GGML_USE_VULKAN)
2379
+ count = ggml_backend_vk_get_device_count();
2380
+ #endif
2381
+ #if defined(GGML_USE_RPC)
2382
+ count += model.rpc_servers.size();
2383
+ #endif
2384
+ return count;
2385
+ GGML_UNUSED(model);
2386
+ }
2387
+
2375
2388
  static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
2376
2389
  ggml_backend_buffer_type_t buft = nullptr;
2377
2390
 
2378
- #ifdef GGML_USE_RPC
2379
- std::string endpoint = model.rpc_servers[gpu];
2380
- buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
2381
- #elif defined(GGML_USE_METAL)
2391
+ #if defined(GGML_USE_RPC)
2392
+ int dev_count = (int)llama_get_device_count(model);
2393
+ int rpc_count = (int)model.rpc_servers.size();
2394
+ if (gpu >= dev_count - rpc_count) {
2395
+ const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
2396
+ return ggml_backend_rpc_buffer_type(endpoint);
2397
+ }
2398
+ #endif
2399
+ #if defined(GGML_USE_METAL)
2382
2400
  buft = ggml_backend_metal_buffer_type();
2383
2401
  #elif defined(GGML_USE_CUDA)
2384
2402
  buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2386,8 +2404,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
2386
2404
  buft = ggml_backend_vk_buffer_type(gpu);
2387
2405
  #elif defined(GGML_USE_SYCL)
2388
2406
  buft = ggml_backend_sycl_buffer_type(gpu);
2389
- #elif defined(GGML_USE_CLBLAST)
2390
- buft = ggml_backend_opencl_buffer_type();
2391
2407
  #elif defined(GGML_USE_KOMPUTE)
2392
2408
  buft = ggml_backend_kompute_buffer_type(gpu);
2393
2409
  if (buft == nullptr) {
@@ -2426,29 +2442,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
2426
2442
  GGML_UNUSED(tensor_split);
2427
2443
  }
2428
2444
 
2429
- static size_t llama_get_device_count(const llama_model & model) {
2430
- #if defined(GGML_USE_RPC)
2431
- return model.rpc_servers.size();
2432
- #elif defined(GGML_USE_CUDA)
2433
- return ggml_backend_cuda_get_device_count();
2434
- #elif defined(GGML_USE_SYCL)
2435
- return ggml_backend_sycl_get_device_count();
2436
- #elif defined(GGML_USE_VULKAN)
2437
- return ggml_backend_vk_get_device_count();
2438
- #else
2439
- return 1;
2440
- #endif
2441
- GGML_UNUSED(model);
2442
- }
2443
-
2444
2445
  static size_t llama_get_device_memory(const llama_model & model, int device) {
2445
2446
  #if defined(GGML_USE_RPC)
2446
- size_t total;
2447
- size_t free;
2448
- std::string endpoint = model.rpc_servers[device];
2449
- ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
2450
- return free;
2451
- #elif defined(GGML_USE_CUDA)
2447
+ int dev_count = (int)llama_get_device_count(model);
2448
+ int rpc_count = (int)model.rpc_servers.size();
2449
+ if (device >= dev_count - rpc_count) {
2450
+ size_t total;
2451
+ size_t free;
2452
+ const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
2453
+ ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
2454
+ return free;
2455
+ }
2456
+ #endif
2457
+ #if defined(GGML_USE_CUDA)
2452
2458
  size_t total;
2453
2459
  size_t free;
2454
2460
  ggml_backend_cuda_get_device_memory(device, &free, &total);
@@ -2520,10 +2526,6 @@ static bool llama_kv_cache_init(
2520
2526
  }
2521
2527
  }
2522
2528
 
2523
- #ifdef GGML_USE_CLBLAST
2524
- offload = false;
2525
- #endif
2526
-
2527
2529
  // count used buffer types
2528
2530
  std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
2529
2531
  if (offload) {
@@ -4003,8 +4005,8 @@ static void llm_load_hparams(
4003
4005
  ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
4004
4006
  hparams.rope_finetuned = rope_finetuned;
4005
4007
 
4006
- hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
4007
- ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
4008
+ hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
4009
+ ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
4008
4010
 
4009
4011
  // rope_freq_base (optional)
4010
4012
  hparams.rope_freq_base_train = 10000.0f;
@@ -4740,7 +4742,20 @@ static void llm_load_vocab(
4740
4742
  auto & token_data = vocab.id_to_token[i];
4741
4743
  token_data.text = std::move(word);
4742
4744
  token_data.score = scores ? scores[i] : 0.0f;
4743
- token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
4745
+ token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
4746
+
4747
+ if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
4748
+ switch(toktypes[i]) {
4749
+ case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
4750
+ case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
4751
+ case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
4752
+ case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
4753
+ case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
4754
+ case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
4755
+ case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
4756
+ default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
4757
+ }
4758
+ }
4744
4759
  }
4745
4760
  GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
4746
4761
 
@@ -4831,7 +4846,7 @@ static void llm_load_vocab(
4831
4846
  // build special tokens cache
4832
4847
  {
4833
4848
  for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
4834
- if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
4849
+ if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
4835
4850
  vocab.cache_special_tokens.push_back(id);
4836
4851
  }
4837
4852
  }
@@ -4845,26 +4860,75 @@ static void llm_load_vocab(
4845
4860
  LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
4846
4861
  }
4847
4862
 
4848
- // build token to piece caches
4863
+ // build token to piece cache
4849
4864
  {
4850
4865
  size_t size_cache = 0;
4851
4866
 
4852
- std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
4853
- std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
4867
+ std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
4854
4868
 
4855
4869
  for (uint32_t id = 0; id < n_vocab; ++id) {
4856
- cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
4857
- cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
4870
+ cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
4858
4871
 
4859
4872
  size_cache += cache_token_to_piece[id].size();
4860
- size_cache += cache_token_to_piece_special[id].size();
4861
4873
  }
4862
4874
 
4863
- std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
4864
- std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
4875
+ std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
4865
4876
 
4866
4877
  LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
4867
4878
  }
4879
+
4880
+ // Handle per token attributes
4881
+ //NOTE: Each model customizes per token attributes.
4882
+ //NOTE: Per token attributes are missing from the GGUF file.
4883
+ //TODO: Extract attributes from GGUF file.
4884
+ {
4885
+ auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
4886
+ for (auto substr : substrs) {
4887
+ if (str.find(substr) < std::string::npos) {
4888
+ return true;
4889
+ }
4890
+ }
4891
+ return false;
4892
+ };
4893
+
4894
+ auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
4895
+ uint32_t current = vocab.id_to_token.at(id).attr;
4896
+ current = value ? (current | attr) : (current & ~attr);
4897
+ vocab.id_to_token[id].attr = (llama_token_attr) current;
4898
+ };
4899
+
4900
+ auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
4901
+ _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
4902
+ };
4903
+
4904
+ std::string model_name;
4905
+ std::string tokenizer_pre;
4906
+
4907
+ ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
4908
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
4909
+
4910
+ // model name to lowercase
4911
+ std::transform(model_name.begin(), model_name.end(), model_name.begin(),
4912
+ [] (const std::string::value_type x) {
4913
+ return std::tolower(x);
4914
+ }
4915
+ );
4916
+
4917
+ // set attributes by model/tokenizer name
4918
+ if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
4919
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
4920
+ } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
4921
+ for (auto id : vocab.cache_special_tokens) {
4922
+ _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
4923
+ }
4924
+ for (auto token : {"</s>"}) {
4925
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
4926
+ }
4927
+ for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
4928
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
4929
+ }
4930
+ }
4931
+ }
4868
4932
  }
4869
4933
 
4870
4934
  static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
@@ -4904,7 +4968,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
4904
4968
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
4905
4969
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
4906
4970
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
4907
- LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
4971
+ LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
4908
4972
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
4909
4973
  LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
4910
4974
  LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
@@ -5129,12 +5193,10 @@ static bool llm_load_tensors(
5129
5193
  // output
5130
5194
  {
5131
5195
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5132
- if (model.arch != LLM_ARCH_MINICPM){
5133
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5134
- // if output is NULL, init from the input tok embed
5135
- if (model.output == NULL) {
5136
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5137
- }
5196
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5197
+ // if output is NULL, init from the input tok embed
5198
+ if (model.output == NULL) {
5199
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5138
5200
  }
5139
5201
  }
5140
5202
 
@@ -7072,7 +7134,7 @@ struct llm_build_context {
7072
7134
  const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
7073
7135
  const int32_t n_outputs;
7074
7136
  const int32_t kv_head; // index of where we store new KV data in the cache
7075
- const int32_t n_orig_ctx;
7137
+ const int32_t n_ctx_orig;
7076
7138
 
7077
7139
  const bool flash_attn;
7078
7140
 
@@ -7121,7 +7183,7 @@ struct llm_build_context {
7121
7183
  n_kv (worst_case ? kv_self.size : kv_self.n),
7122
7184
  n_outputs (worst_case ? n_tokens : lctx.n_outputs),
7123
7185
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
7124
- n_orig_ctx (cparams.n_yarn_orig_ctx),
7186
+ n_ctx_orig (cparams.n_ctx_orig_yarn),
7125
7187
  flash_attn (cparams.flash_attn),
7126
7188
  pooling_type (cparams.pooling_type),
7127
7189
  rope_type (hparams.rope_type),
@@ -7179,7 +7241,7 @@ struct llm_build_context {
7179
7241
  ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
7180
7242
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
7181
7243
  0),
7182
- lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7244
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7183
7245
  ext_factor, attn_factor, beta_fast, beta_slow);
7184
7246
 
7185
7247
  cb(tmp, "K_shifted", il);
@@ -7288,7 +7350,7 @@ struct llm_build_context {
7288
7350
  // choose long/short freq factors based on the context size
7289
7351
  const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
7290
7352
 
7291
- if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
7353
+ if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
7292
7354
  return model.layers[il].rope_long;
7293
7355
  }
7294
7356
 
@@ -7404,14 +7466,14 @@ struct llm_build_context {
7404
7466
 
7405
7467
  Qcur = ggml_rope_ext(
7406
7468
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7407
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7469
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7408
7470
  ext_factor, attn_factor, beta_fast, beta_slow
7409
7471
  );
7410
7472
  cb(Qcur, "Qcur", il);
7411
7473
 
7412
7474
  Kcur = ggml_rope_ext(
7413
7475
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7414
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7476
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7415
7477
  ext_factor, attn_factor, beta_fast, beta_slow
7416
7478
  );
7417
7479
  cb(Kcur, "Kcur", il);
@@ -7535,12 +7597,12 @@ struct llm_build_context {
7535
7597
  case MODEL_7B:
7536
7598
  Qcur = ggml_rope_ext(
7537
7599
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7538
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7600
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7539
7601
  ext_factor, attn_factor, beta_fast, beta_slow
7540
7602
  );
7541
7603
  Kcur = ggml_rope_ext(
7542
7604
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7543
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7605
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7544
7606
  ext_factor, attn_factor, beta_fast, beta_slow
7545
7607
  );
7546
7608
  break;
@@ -7647,14 +7709,14 @@ struct llm_build_context {
7647
7709
 
7648
7710
  Qcur = ggml_rope_ext(
7649
7711
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7650
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7712
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7651
7713
  ext_factor, attn_factor, beta_fast, beta_slow
7652
7714
  );
7653
7715
  cb(Qcur, "Qcur", il);
7654
7716
 
7655
7717
  Kcur = ggml_rope_ext(
7656
7718
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7657
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7719
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7658
7720
  ext_factor, attn_factor, beta_fast, beta_slow
7659
7721
  );
7660
7722
  cb(Kcur, "Kcur", il);
@@ -7767,13 +7829,13 @@ struct llm_build_context {
7767
7829
 
7768
7830
  // using mode = 2 for neox mode
7769
7831
  Qcur = ggml_rope_ext(
7770
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7832
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
7771
7833
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7772
7834
  );
7773
7835
  cb(Qcur, "Qcur", il);
7774
7836
 
7775
7837
  Kcur = ggml_rope_ext(
7776
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7838
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
7777
7839
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7778
7840
  );
7779
7841
  cb(Kcur, "Kcur", il);
@@ -7891,14 +7953,14 @@ struct llm_build_context {
7891
7953
 
7892
7954
  Qcur = ggml_rope_ext(
7893
7955
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7894
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7956
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7895
7957
  ext_factor, attn_factor, beta_fast, beta_slow
7896
7958
  );
7897
7959
  cb(Qcur, "Qcur", il);
7898
7960
 
7899
7961
  Kcur = ggml_rope_ext(
7900
7962
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7901
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7963
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7902
7964
  ext_factor, attn_factor, beta_fast, beta_slow
7903
7965
  );
7904
7966
  cb(Kcur, "Kcur", il);
@@ -8044,14 +8106,14 @@ struct llm_build_context {
8044
8106
 
8045
8107
  Qcur = ggml_rope_ext(
8046
8108
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8047
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8109
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8048
8110
  ext_factor, attn_factor, beta_fast, beta_slow
8049
8111
  );
8050
8112
  cb(Qcur, "Qcur", il);
8051
8113
 
8052
8114
  Kcur = ggml_rope_ext(
8053
8115
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8054
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8116
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8055
8117
  ext_factor, attn_factor, beta_fast, beta_slow
8056
8118
  );
8057
8119
  cb(Kcur, "Kcur", il);
@@ -8398,14 +8460,14 @@ struct llm_build_context {
8398
8460
 
8399
8461
  Qcur = ggml_rope_ext(
8400
8462
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8401
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8463
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8402
8464
  ext_factor, attn_factor, beta_fast, beta_slow
8403
8465
  );
8404
8466
  cb(Qcur, "Qcur", il);
8405
8467
 
8406
8468
  Kcur = ggml_rope_ext(
8407
8469
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8408
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8470
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8409
8471
  ext_factor, attn_factor, beta_fast, beta_slow
8410
8472
  );
8411
8473
  cb(Kcur, "Kcur", il);
@@ -8838,14 +8900,14 @@ struct llm_build_context {
8838
8900
 
8839
8901
  Qcur = ggml_rope_ext(
8840
8902
  ctx0, Qcur, inp_pos, nullptr,
8841
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8903
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8842
8904
  ext_factor, attn_factor, beta_fast, beta_slow
8843
8905
  );
8844
8906
  cb(Qcur, "Qcur", il);
8845
8907
 
8846
8908
  Kcur = ggml_rope_ext(
8847
8909
  ctx0, Kcur, inp_pos, nullptr,
8848
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8910
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8849
8911
  ext_factor, attn_factor, beta_fast, beta_slow
8850
8912
  );
8851
8913
  cb(Kcur, "Kcur", il);
@@ -8957,13 +9019,13 @@ struct llm_build_context {
8957
9019
 
8958
9020
  // using mode = 2 for neox mode
8959
9021
  Qcur = ggml_rope_ext(
8960
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9022
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
8961
9023
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8962
9024
  );
8963
9025
  cb(Qcur, "Qcur", il);
8964
9026
 
8965
9027
  Kcur = ggml_rope_ext(
8966
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9028
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
8967
9029
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8968
9030
  );
8969
9031
  cb(Kcur, "Kcur", il);
@@ -9069,14 +9131,14 @@ struct llm_build_context {
9069
9131
 
9070
9132
  Qcur = ggml_rope_ext(
9071
9133
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9072
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9134
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9073
9135
  ext_factor, attn_factor, beta_fast, beta_slow
9074
9136
  );
9075
9137
  cb(Qcur, "Qcur", il);
9076
9138
 
9077
9139
  Kcur = ggml_rope_ext(
9078
9140
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9079
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9141
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9080
9142
  ext_factor, attn_factor, beta_fast, beta_slow
9081
9143
  );
9082
9144
  cb(Kcur, "Kcur", il);
@@ -9183,14 +9245,14 @@ struct llm_build_context {
9183
9245
 
9184
9246
  Qcur = ggml_rope_ext(
9185
9247
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9186
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9248
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9187
9249
  ext_factor, attn_factor, beta_fast, beta_slow
9188
9250
  );
9189
9251
  cb(Qcur, "Qcur", il);
9190
9252
 
9191
9253
  Kcur = ggml_rope_ext(
9192
9254
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9193
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9255
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9194
9256
  ext_factor, attn_factor, beta_fast, beta_slow
9195
9257
  );
9196
9258
  cb(Kcur, "Kcur", il);
@@ -9335,7 +9397,7 @@ struct llm_build_context {
9335
9397
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9336
9398
 
9337
9399
  Qcur = ggml_rope_ext(
9338
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9400
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
9339
9401
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9340
9402
  );
9341
9403
  cb(Qcur, "Qcur", il);
@@ -9346,7 +9408,7 @@ struct llm_build_context {
9346
9408
  cb(Qcur, "Qcur", il);
9347
9409
 
9348
9410
  Kcur = ggml_rope_ext(
9349
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9411
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
9350
9412
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9351
9413
  );
9352
9414
  cb(Kcur, "Kcur", il);
@@ -9457,7 +9519,7 @@ struct llm_build_context {
9457
9519
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9458
9520
 
9459
9521
  Qcur = ggml_rope_ext(
9460
- ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9522
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
9461
9523
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9462
9524
  );
9463
9525
  cb(Qcur, "Qcur", il);
@@ -9466,7 +9528,7 @@ struct llm_build_context {
9466
9528
  cb(Qcur, "Qcur", il);
9467
9529
 
9468
9530
  Kcur = ggml_rope_ext(
9469
- ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9531
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
9470
9532
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9471
9533
  );
9472
9534
  cb(Kcur, "Kcur", il);
@@ -9574,13 +9636,13 @@ struct llm_build_context {
9574
9636
 
9575
9637
  Qcur = ggml_rope_ext(
9576
9638
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
9577
- n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9639
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
9578
9640
  ext_factor, attn_factor, beta_fast, beta_slow);
9579
9641
  cb(Qcur, "Qcur", il);
9580
9642
 
9581
9643
  Kcur = ggml_rope_ext(
9582
9644
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
9583
- n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9645
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
9584
9646
  ext_factor, attn_factor, beta_fast, beta_slow);
9585
9647
  cb(Kcur, "Kcur", il);
9586
9648
 
@@ -9782,14 +9844,14 @@ struct llm_build_context {
9782
9844
 
9783
9845
  struct ggml_tensor * Qcur = ggml_rope_ext(
9784
9846
  ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9785
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9847
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9786
9848
  ext_factor, attn_factor, beta_fast, beta_slow
9787
9849
  );
9788
9850
  cb(Qcur, "Qcur", il);
9789
9851
 
9790
9852
  struct ggml_tensor * Kcur = ggml_rope_ext(
9791
9853
  ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9792
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9854
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9793
9855
  ext_factor, attn_factor, beta_fast, beta_slow
9794
9856
  );
9795
9857
  cb(Kcur, "Kcur", il);
@@ -9898,14 +9960,14 @@ struct llm_build_context {
9898
9960
 
9899
9961
  Qcur = ggml_rope_ext(
9900
9962
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9901
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9963
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9902
9964
  ext_factor, attn_factor, beta_fast, beta_slow
9903
9965
  );
9904
9966
  cb(Qcur, "Qcur", il);
9905
9967
 
9906
9968
  Kcur = ggml_rope_ext(
9907
9969
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9908
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9970
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9909
9971
  ext_factor, attn_factor, beta_fast, beta_slow
9910
9972
  );
9911
9973
  cb(Kcur, "Kcur", il);
@@ -10015,14 +10077,14 @@ struct llm_build_context {
10015
10077
 
10016
10078
  Qcur = ggml_rope_ext(
10017
10079
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10018
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10080
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10019
10081
  ext_factor, attn_factor, beta_fast, beta_slow
10020
10082
  );
10021
10083
  cb(Qcur, "Qcur", il);
10022
10084
 
10023
10085
  Kcur = ggml_rope_ext(
10024
10086
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10025
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10087
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10026
10088
  ext_factor, attn_factor, beta_fast, beta_slow
10027
10089
  );
10028
10090
  cb(Kcur, "Kcur", il);
@@ -10145,14 +10207,14 @@ struct llm_build_context {
10145
10207
 
10146
10208
  Qcur = ggml_rope_ext(
10147
10209
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10148
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10210
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10149
10211
  ext_factor, attn_factor, beta_fast, beta_slow
10150
10212
  );
10151
10213
  cb(Qcur, "Qcur", il);
10152
10214
 
10153
10215
  Kcur = ggml_rope_ext(
10154
10216
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10155
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10217
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10156
10218
  ext_factor, attn_factor, beta_fast, beta_slow
10157
10219
  );
10158
10220
  cb(Kcur, "Kcur", il);
@@ -10217,7 +10279,7 @@ struct llm_build_context {
10217
10279
  cb(cur, "lmhead_scaling", -1);
10218
10280
 
10219
10281
  // lm_head
10220
- cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
10282
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10221
10283
  cb(cur, "result_output", -1);
10222
10284
 
10223
10285
  ggml_build_forward_expand(gf, cur);
@@ -10265,7 +10327,7 @@ struct llm_build_context {
10265
10327
 
10266
10328
  Qcur = ggml_rope_ext(
10267
10329
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
10268
- n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10330
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
10269
10331
  ext_factor, attn_factor, beta_fast, beta_slow);
10270
10332
  cb(Qcur, "Qcur", il);
10271
10333
 
@@ -10274,7 +10336,7 @@ struct llm_build_context {
10274
10336
 
10275
10337
  Kcur = ggml_rope_ext(
10276
10338
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
10277
- n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10339
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
10278
10340
  ext_factor, attn_factor, beta_fast, beta_slow);
10279
10341
  cb(Kcur, "Kcur", il);
10280
10342
 
@@ -10385,14 +10447,14 @@ struct llm_build_context {
10385
10447
 
10386
10448
  Qcur = ggml_rope_ext(
10387
10449
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10388
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10450
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10389
10451
  ext_factor, attn_factor, beta_fast, beta_slow
10390
10452
  );
10391
10453
  cb(Qcur, "Qcur", il);
10392
10454
 
10393
10455
  Kcur = ggml_rope_ext(
10394
10456
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10395
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10457
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10396
10458
  ext_factor, attn_factor, beta_fast, beta_slow
10397
10459
  );
10398
10460
  cb(Kcur, "Kcur", il);
@@ -10675,14 +10737,14 @@ struct llm_build_context {
10675
10737
 
10676
10738
  Qcur = ggml_rope_ext(
10677
10739
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10678
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10740
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10679
10741
  ext_factor, attn_factor, beta_fast, beta_slow
10680
10742
  );
10681
10743
  cb(Qcur, "Qcur", il);
10682
10744
 
10683
10745
  Kcur = ggml_rope_ext(
10684
10746
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10685
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10747
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10686
10748
  ext_factor, attn_factor, beta_fast, beta_slow
10687
10749
  );
10688
10750
  cb(Kcur, "Kcur", il);
@@ -10806,14 +10868,14 @@ struct llm_build_context {
10806
10868
 
10807
10869
  Qcur = ggml_rope_ext(
10808
10870
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10809
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10871
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10810
10872
  ext_factor, attn_factor, beta_fast, beta_slow
10811
10873
  );
10812
10874
  cb(Qcur, "Qcur", il);
10813
10875
 
10814
10876
  Kcur = ggml_rope_ext(
10815
10877
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10816
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10878
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10817
10879
  ext_factor, attn_factor, beta_fast, beta_slow
10818
10880
  );
10819
10881
  cb(Kcur, "Kcur", il);
@@ -10920,14 +10982,14 @@ struct llm_build_context {
10920
10982
 
10921
10983
  Qcur = ggml_rope_ext(
10922
10984
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10923
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10985
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10924
10986
  ext_factor, attn_factor, beta_fast, beta_slow
10925
10987
  );
10926
10988
  cb(Qcur, "Qcur", il);
10927
10989
 
10928
10990
  Kcur = ggml_rope_ext(
10929
10991
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10930
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10992
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10931
10993
  ext_factor, attn_factor, beta_fast, beta_slow
10932
10994
  );
10933
10995
  cb(Kcur, "Kcur", il);
@@ -11055,14 +11117,14 @@ struct llm_build_context {
11055
11117
 
11056
11118
  Qcur = ggml_rope_ext(
11057
11119
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
11058
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11120
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11059
11121
  ext_factor, attn_factor, beta_fast, beta_slow
11060
11122
  );
11061
11123
  cb(Qcur, "Qcur", il);
11062
11124
 
11063
11125
  Kcur = ggml_rope_ext(
11064
11126
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
11065
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11127
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11066
11128
  ext_factor, attn_factor, beta_fast, beta_slow
11067
11129
  );
11068
11130
  cb(Kcur, "Kcur", il);
@@ -11272,7 +11334,7 @@ struct llm_build_context {
11272
11334
  q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11273
11335
  q_pe = ggml_rope_ext(
11274
11336
  ctx0, q_pe, inp_pos, nullptr,
11275
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11337
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11276
11338
  ext_factor, attn_factor_scaled, beta_fast, beta_slow
11277
11339
  );
11278
11340
  cb(q_pe, "q_pe", il);
@@ -11281,7 +11343,7 @@ struct llm_build_context {
11281
11343
  k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11282
11344
  k_pe = ggml_rope_ext(
11283
11345
  ctx0, k_pe, inp_pos, nullptr,
11284
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11346
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11285
11347
  ext_factor, attn_factor_scaled, beta_fast, beta_slow
11286
11348
  );
11287
11349
  cb(k_pe, "k_pe", il);
@@ -12616,27 +12678,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
12616
12678
 
12617
12679
  static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
12618
12680
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12619
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
12681
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
12620
12682
  }
12621
12683
 
12622
12684
  static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
12623
12685
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12624
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
12686
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
12625
12687
  }
12626
12688
 
12627
12689
  static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
12628
12690
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12629
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
12691
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
12630
12692
  }
12631
12693
 
12632
12694
  static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
12633
12695
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12634
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
12696
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
12635
12697
  }
12636
12698
 
12637
12699
  static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
12638
12700
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12639
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
12701
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
12640
12702
  }
12641
12703
 
12642
12704
  static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
@@ -13254,7 +13316,8 @@ struct fragment_buffer_variant {
13254
13316
  static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
13255
13317
  // for each special token
13256
13318
  for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
13257
- const auto & special_token = vocab.id_to_token[special_id].text;
13319
+ const auto & data = vocab.id_to_token[special_id];
13320
+ const auto & special_token = data.text;
13258
13321
 
13259
13322
  // for each text fragment
13260
13323
  std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -13291,13 +13354,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
13291
13354
  if (match > raw_text_base_offset) {
13292
13355
  // left
13293
13356
  const int64_t left_reminder_offset = raw_text_base_offset + 0;
13294
- const int64_t left_reminder_length = match - raw_text_base_offset;
13295
- buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
13357
+ int64_t left_reminder_length = match - raw_text_base_offset;
13358
+
13359
+ if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
13360
+ while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
13361
+ left_reminder_length--;
13362
+ }
13363
+ }
13364
+
13365
+ if (left_reminder_length > 0) {
13366
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
13367
+ it++;
13368
+ }
13296
13369
 
13297
13370
  #ifdef PRETOKENIZERDEBUG
13298
13371
  LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
13299
13372
  #endif
13300
- it++;
13301
13373
  }
13302
13374
 
13303
13375
  // special token
@@ -13306,16 +13378,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
13306
13378
 
13307
13379
  // right
13308
13380
  if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
13309
- const int64_t right_reminder_offset = match + special_token.length();
13310
- const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
13311
- buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
13381
+ int64_t right_reminder_offset = match + special_token.length();
13382
+ int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
13383
+
13384
+ if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
13385
+ while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
13386
+ right_reminder_offset++;
13387
+ right_reminder_length--;
13388
+ }
13389
+ }
13390
+
13391
+ if (right_reminder_length > 0) {
13392
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
13393
+ it++;
13394
+ }
13312
13395
 
13313
13396
  #ifdef PRETOKENIZERDEBUG
13314
13397
  LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
13315
13398
  #endif
13316
13399
 
13317
- it++;
13318
-
13319
13400
  if (source == 0) {
13320
13401
  buffer.erase_after(buffer.before_begin());
13321
13402
  } else {
@@ -13361,9 +13442,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13361
13442
  // tokenizer.encode('', add_special_tokens=True) returns [1]
13362
13443
  // tokenizer.encode('', add_special_tokens=False) returns []
13363
13444
 
13364
- static const bool rtrim = true; //TODO: as param
13365
13445
  bool is_prev_special = false;
13366
- bool special_token_rtrim = false;
13367
13446
 
13368
13447
  if (add_special && vocab.special_add_bos != 0) {
13369
13448
  GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13373,25 +13452,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13373
13452
 
13374
13453
  for (const auto & fragment : fragment_buffer) {
13375
13454
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
13376
- // without adding this leading whitespace, we do not get the same results as the original tokenizer
13377
-
13378
- // TODO: It's likely possible to get rid of this string copy entirely
13379
- // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
13380
- // and passing 'add space prefix' as bool argument
13381
- //
13382
13455
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
13383
13456
 
13384
- if (special_token_rtrim) {
13385
- size_t num_whitespaces = 0;
13386
- while (isspace(raw_text[num_whitespaces])) {
13387
- num_whitespaces++;
13388
- }
13389
- if (num_whitespaces == raw_text.size()) {
13390
- continue; // skip if all whitespaces
13391
- }
13392
- raw_text = raw_text.substr(num_whitespaces);
13393
- }
13394
-
13395
13457
  if (vocab.add_space_prefix) {
13396
13458
  if (!output.size() || is_prev_special) { // prefix with space if first token
13397
13459
  raw_text = " " + raw_text;
@@ -13407,11 +13469,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13407
13469
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
13408
13470
  output.push_back(fragment.token);
13409
13471
  is_prev_special = true;
13410
- // phi-3 special tokens without rtrim, works fine for llama-spm too
13411
- special_token_rtrim = rtrim
13412
- && fragment.token != vocab.special_bos_id
13413
- && fragment.token != vocab.special_unk_id
13414
- && fragment.token != vocab.special_eos_id;
13415
13472
  }
13416
13473
  }
13417
13474
 
@@ -14646,260 +14703,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
14646
14703
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
14647
14704
  }
14648
14705
 
14649
- //
14650
- // Beam search
14651
- //
14652
-
14653
- struct llama_beam {
14654
- std::vector<llama_token> tokens;
14655
- float p; // Cumulative beam probability (renormalized relative to all beams)
14656
- bool eob; // Initialize end-of-beam to false. Callback sets this to true.
14657
- // Sort beams by probability. In case of ties, prefer beams at eob.
14658
- bool operator<(const llama_beam & rhs) const {
14659
- return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
14660
- }
14661
- // Shift off first n tokens and discard them.
14662
- void shift_tokens(const size_t n) {
14663
- if (n) {
14664
- std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
14665
- tokens.resize(tokens.size() - n);
14666
- }
14667
- }
14668
- llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
14669
- };
14670
-
14671
- // A struct for calculating logit-related info.
14672
- struct llama_logit_info {
14673
- const float * const logits;
14674
- const int n_vocab;
14675
- const float max_l;
14676
- const float normalizer;
14677
- struct sum_exp {
14678
- float max_l;
14679
- float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
14680
- };
14681
- llama_logit_info(llama_context * ctx)
14682
- : logits(llama_get_logits(ctx))
14683
- , n_vocab(llama_n_vocab(llama_get_model(ctx)))
14684
- , max_l(*std::max_element(logits, logits + n_vocab))
14685
- , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
14686
- { }
14687
- llama_token_data get_token_data(const llama_token token_id) const {
14688
- constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
14689
- return {token_id, logits[token_id], p};
14690
- }
14691
- // Return top k token_data by logit.
14692
- std::vector<llama_token_data> top_k(size_t k) {
14693
- std::vector<llama_token_data> min_heap; // min-heap by logit
14694
- const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
14695
- min_heap.reserve(k_min);
14696
- for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
14697
- min_heap.push_back(get_token_data(token_id));
14698
- }
14699
- auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
14700
- std::make_heap(min_heap.begin(), min_heap.end(), comp);
14701
- for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
14702
- if (min_heap.front().logit < logits[token_id]) {
14703
- std::pop_heap(min_heap.begin(), min_heap.end(), comp);
14704
- min_heap.back().id = token_id;
14705
- min_heap.back().logit = logits[token_id];
14706
- std::push_heap(min_heap.begin(), min_heap.end(), comp);
14707
- }
14708
- }
14709
- return min_heap;
14710
- }
14711
- float probability_from_logit(float logit) const {
14712
- return normalizer * std::exp(logit - max_l);
14713
- }
14714
- };
14715
-
14716
- struct llama_beam_search_data {
14717
- llama_context * ctx;
14718
- size_t n_beams;
14719
- int n_past;
14720
- int n_predict;
14721
- std::vector<llama_beam> beams;
14722
- std::vector<llama_beam> next_beams;
14723
-
14724
- // Re-calculated on each loop iteration
14725
- size_t common_prefix_length;
14726
-
14727
- // Used to communicate to/from callback on beams state.
14728
- std::vector<llama_beam_view> beam_views;
14729
-
14730
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
14731
- : ctx(ctx)
14732
- , n_beams(n_beams)
14733
- , n_past(n_past)
14734
- , n_predict(n_predict)
14735
- , beam_views(n_beams) {
14736
- beams.reserve(n_beams);
14737
- next_beams.reserve(n_beams);
14738
- }
14739
-
14740
- // Collapse beams to a single beam given by index.
14741
- void collapse_beams(const size_t beam_idx) {
14742
- if (0u < beam_idx) {
14743
- std::swap(beams[0], beams[beam_idx]);
14744
- }
14745
- beams.resize(1);
14746
- }
14747
-
14748
- // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
14749
- // The repetitive patterns below reflect the 2 stages of heaps:
14750
- // * Gather elements until the vector is full, then call std::make_heap() on it.
14751
- // * If the heap is full and a new element is found that should be included, pop the
14752
- // least element to the back(), replace it with the new, then push it into the heap.
14753
- void fill_next_beams_by_top_probabilities(llama_beam & beam) {
14754
- // Min-heaps use a greater-than comparator.
14755
- const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
14756
- if (beam.eob) {
14757
- // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
14758
- if (next_beams.size() < n_beams) {
14759
- next_beams.push_back(std::move(beam));
14760
- if (next_beams.size() == n_beams) {
14761
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
14762
- }
14763
- } else if (next_beams.front().p < beam.p) {
14764
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
14765
- next_beams.back() = std::move(beam);
14766
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
14767
- }
14768
- } else {
14769
- // beam is not at end-of-sentence, so branch with next top_k tokens.
14770
- if (!beam.tokens.empty()) {
14771
- llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
14772
- }
14773
- llama_logit_info logit_info(ctx);
14774
- std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
14775
-
14776
- // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
14777
- // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
14778
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
14779
-
14780
- size_t i=0;
14781
- if (next_beams.size() < n_beams) {
14782
- for (; next_beams.size() < n_beams ; ++i) {
14783
- llama_beam next_beam = beam;
14784
- next_beam.tokens.push_back(next_tokens[i].id);
14785
- next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
14786
- next_beams.push_back(std::move(next_beam));
14787
- }
14788
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
14789
- } else {
14790
- for (; next_beams.front().p == 0.0f ; ++i) {
14791
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
14792
- next_beams.back() = beam;
14793
- next_beams.back().tokens.push_back(next_tokens[i].id);
14794
- next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
14795
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
14796
- }
14797
- }
14798
- for (; i < n_beams ; ++i) {
14799
- const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
14800
- if (next_beams.front().p < next_p) {
14801
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
14802
- next_beams.back() = beam;
14803
- next_beams.back().tokens.push_back(next_tokens[i].id);
14804
- next_beams.back().p = next_p;
14805
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
14806
- }
14807
- }
14808
- }
14809
- }
14810
-
14811
- // Find common_prefix_length based on beams.
14812
- // Requires beams is not empty.
14813
- size_t find_common_prefix_length() {
14814
- size_t common_prefix_length = beams[0].tokens.size();
14815
- for (size_t i = 1 ; i < beams.size() ; ++i) {
14816
- common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
14817
- for (size_t j = 0 ; j < common_prefix_length ; ++j) {
14818
- if (beams[0].tokens[j] != beams[i].tokens[j]) {
14819
- common_prefix_length = j;
14820
- break;
14821
- }
14822
- }
14823
- }
14824
- return common_prefix_length;
14825
- }
14826
-
14827
- // Construct beams_state to send back to caller via the callback function.
14828
- // Side effect: set common_prefix_length = find_common_prefix_length();
14829
- llama_beams_state get_beams_state(const bool last_call) {
14830
- for (size_t i = 0 ; i < beams.size() ; ++i) {
14831
- beam_views[i] = beams[i].view();
14832
- }
14833
- common_prefix_length = find_common_prefix_length();
14834
- return {beam_views.data(), beams.size(), common_prefix_length, last_call};
14835
- }
14836
-
14837
- // Loop:
14838
- // * while i < n_predict, AND
14839
- // * any of the beams have not yet reached end-of-beam (eob), AND
14840
- // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
14841
- // (since all other beam probabilities can only decrease)
14842
- void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
14843
- beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
14844
- const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
14845
- for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
14846
- !beams[top_beam_index()].eob ; ++i) {
14847
- callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
14848
- update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
14849
- if (common_prefix_length) {
14850
- llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
14851
- n_past += common_prefix_length;
14852
- }
14853
- // Zero-out next_beam probabilities to place them last in following min-heap.
14854
- std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
14855
- for (llama_beam & beam : beams) {
14856
- beam.shift_tokens(common_prefix_length);
14857
- fill_next_beams_by_top_probabilities(beam);
14858
- }
14859
- // next_beams become the beams of next/final iteration. Swap them to re-use memory.
14860
- beams.swap(next_beams);
14861
- renormalize_beam_probabilities(beams);
14862
- }
14863
- collapse_beams(top_beam_index());
14864
- callback(callback_data, get_beams_state(true));
14865
- }
14866
-
14867
- // As beams grow, the cumulative probabilities decrease.
14868
- // Renormalize them to avoid floating point underflow.
14869
- static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
14870
- const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
14871
- const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
14872
- std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
14873
- }
14874
-
14875
- // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
14876
- size_t top_beam_index() {
14877
- return std::max_element(beams.begin(), beams.end()) - beams.begin();
14878
- }
14879
-
14880
- // Copy (p,eob) for each beam which may have been changed by the callback.
14881
- void update_beams_from_beam_views() {
14882
- for (size_t i = 0 ; i < beams.size() ; ++i) {
14883
- beams[i].p = beam_views[i].p;
14884
- beams[i].eob = beam_views[i].eob;
14885
- }
14886
- }
14887
- };
14888
-
14889
- void llama_beam_search(llama_context * ctx,
14890
- llama_beam_search_callback_fn_t callback, void * callback_data,
14891
- size_t n_beams, int n_past, int n_predict) {
14892
- assert(ctx);
14893
- const int64_t t_start_sample_us = ggml_time_us();
14894
-
14895
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
14896
-
14897
- beam_search_data.loop(callback, callback_data);
14898
-
14899
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
14900
- ctx->n_sample++;
14901
- }
14902
-
14903
14706
  //
14904
14707
  // quantization
14905
14708
  //
@@ -16110,7 +15913,7 @@ bool llama_supports_mlock(void) {
16110
15913
  }
16111
15914
 
16112
15915
  bool llama_supports_gpu_offload(void) {
16113
- #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
15916
+ #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
16114
15917
  defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
16115
15918
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
16116
15919
  return true;
@@ -16167,7 +15970,7 @@ struct llama_model * llama_load_model_from_file(
16167
15970
  return true;
16168
15971
  };
16169
15972
  }
16170
- if (params.rpc_servers != nullptr) {
15973
+ if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
16171
15974
  // split the servers set them into model->rpc_servers
16172
15975
  std::string servers(params.rpc_servers);
16173
15976
  size_t pos = 0;
@@ -16221,6 +16024,11 @@ struct llama_context * llama_new_context_with_model(
16221
16024
  params.flash_attn = false;
16222
16025
  }
16223
16026
 
16027
+ if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
16028
+ LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
16029
+ return nullptr;
16030
+ }
16031
+
16224
16032
  llama_context * ctx = new llama_context(*model);
16225
16033
 
16226
16034
  const auto & hparams = model->hparams;
@@ -16259,8 +16067,8 @@ struct llama_context * llama_new_context_with_model(
16259
16067
 
16260
16068
  cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
16261
16069
 
16262
- cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
16263
- hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
16070
+ cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
16071
+ hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
16264
16072
  hparams.n_ctx_train;
16265
16073
 
16266
16074
  cparams.cb_eval = params.cb_eval;
@@ -16325,17 +16133,7 @@ struct llama_context * llama_new_context_with_model(
16325
16133
 
16326
16134
  if (!hparams.vocab_only) {
16327
16135
  // initialize backends
16328
- #if defined(GGML_USE_RPC)
16329
- for (auto & server : model->rpc_servers) {
16330
- ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
16331
- if (backend == nullptr) {
16332
- LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
16333
- llama_free(ctx);
16334
- return nullptr;
16335
- }
16336
- ctx->backends.push_back(backend);
16337
- }
16338
- #elif defined(GGML_USE_METAL)
16136
+ #if defined(GGML_USE_METAL)
16339
16137
  if (model->n_gpu_layers > 0) {
16340
16138
  ctx->backend_metal = ggml_backend_metal_init();
16341
16139
  if (ctx->backend_metal == nullptr) {
@@ -16374,7 +16172,7 @@ struct llama_context * llama_new_context_with_model(
16374
16172
  return nullptr;
16375
16173
  }
16376
16174
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
16377
- ggml_backend_t backend = ggml_backend_vk_init(0);
16175
+ ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
16378
16176
  if (backend == nullptr) {
16379
16177
  LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
16380
16178
  llama_free(ctx);
@@ -16427,6 +16225,19 @@ struct llama_context * llama_new_context_with_model(
16427
16225
  }
16428
16226
  ctx->backends.push_back(backend);
16429
16227
  }
16228
+ #endif
16229
+ #if defined(GGML_USE_RPC)
16230
+ if (model->n_gpu_layers > 0) {
16231
+ for (const auto & endpoint : model->rpc_servers) {
16232
+ ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
16233
+ if (backend == nullptr) {
16234
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
16235
+ llama_free(ctx);
16236
+ return nullptr;
16237
+ }
16238
+ ctx->backends.push_back(backend);
16239
+ }
16240
+ }
16430
16241
  #endif
16431
16242
  ctx->backend_cpu = ggml_backend_cpu_init();
16432
16243
  if (ctx->backend_cpu == nullptr) {
@@ -18209,9 +18020,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
18209
18020
  return model->vocab.id_to_token[token].score;
18210
18021
  }
18211
18022
 
18212
- llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
18023
+ llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
18213
18024
  GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
18214
- return model->vocab.id_to_token[token].type;
18025
+ return model->vocab.id_to_token[token].attr;
18215
18026
  }
18216
18027
 
18217
18028
  bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
@@ -18313,9 +18124,14 @@ static std::string llama_decode_text(const std::string & text) {
18313
18124
 
18314
18125
  // does not write null-terminator to buf
18315
18126
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
18127
+ // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
18128
+ if (!special && llama_is_control_token(model->vocab, token)) {
18129
+ return 0;
18130
+ }
18131
+
18316
18132
  // if we have a cache - use it
18317
18133
  {
18318
- const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
18134
+ const auto & cache = model->vocab.cache_token_to_piece;
18319
18135
 
18320
18136
  if (!cache.empty()) {
18321
18137
  const auto & res = cache.at(token);