llama_cpp 0.15.4 → 0.16.0

Files changed (147)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +15 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +13 -1
  7. data/vendor/tmp/llama.cpp/Makefile +62 -35
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
  131. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
  132. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  133. data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
  134. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  135. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
  136. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
  137. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  138. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
  139. data/vendor/tmp/llama.cpp/ggml.c +178 -330
  140. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  141. data/vendor/tmp/llama.cpp/llama.cpp +242 -426
  142. data/vendor/tmp/llama.cpp/llama.h +17 -43
  143. metadata +121 -6
  144. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  145. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  146. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  147. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
@@ -13,8 +13,6 @@
 
 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
-#elif defined(GGML_USE_CLBLAST)
-#  include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)
 #  include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
@@ -110,7 +108,7 @@
 //
 
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal        (ggml_log_level level, const char* format, ...);
+static void llama_log_internal        (ggml_log_level level, const char * format, ...);
 static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
 #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
@@ -1850,7 +1848,7 @@ struct llama_hparams {
     float    rope_attn_factor = 1.0f;
     float    rope_freq_base_train;
     float    rope_freq_scale_train;
-    uint32_t n_yarn_orig_ctx;
+    uint32_t n_ctx_orig_yarn;
     float    rope_yarn_log_mul;
 
     // for State Space Models
@@ -1892,7 +1890,7 @@ struct llama_hparams {
         if (this->n_expert_shared != other.n_expert_shared) return true;
 
         if (this->rope_finetuned  != other.rope_finetuned)  return true;
-        if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
+        if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
 
         if (this->ssm_d_conv  != other.ssm_d_conv)  return true;
         if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -1951,7 +1949,7 @@ struct llama_cparams {
     float rope_freq_base;
     float rope_freq_scale;
 
-    uint32_t n_yarn_orig_ctx;
+    uint32_t n_ctx_orig_yarn;
     // These hyperparameters are not exposed in GGUF, because all
    // existing YaRN models use the same values for them.
     float yarn_ext_factor;
@@ -2149,12 +2147,12 @@ struct llama_control_vector {
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
-    using ttype = llama_token_type;
+    using tattr = llama_token_attr;
 
     struct token_data {
         token text;
         float score;
-        ttype type;
+        tattr attr;
     };
 
     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
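
The `ttype`/`type` to `tattr`/`attr` switch replaces the old single-valued `llama_token_type` enum with `llama_token_attr` bitflags, so one token can carry several properties at once. A minimal sketch, assuming the `LLAMA_TOKEN_ATTR_*` constants are distinct bits as declared in the updated llama.h:

    // Combine flags: a control token that also strips trailing whitespace.
    uint32_t attr = LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_RSTRIP;

    // Test flags with bitwise AND instead of enum equality.
    bool is_control = (attr & LLAMA_TOKEN_ATTR_CONTROL) != 0; // true
    bool is_normal  = (attr & LLAMA_TOKEN_ATTR_NORMAL)  != 0; // false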
@@ -2164,8 +2162,7 @@ struct llama_vocab {
     std::vector<token_data> id_to_token;
 
     std::vector<id>    cache_special_tokens;
-    std::vector<token> cache_token_to_piece;         // llama_token_to_piece(special = false);
-    std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
+    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
 
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
@@ -2372,13 +2369,34 @@ struct llama_context {
     struct llama_control_vector cvec;
 };
 
+static size_t llama_get_device_count(const llama_model & model) {
+    size_t count = 1;
+#if defined(GGML_USE_CUDA)
+    count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC)
+    count += model.rpc_servers.size();
+#endif
+    return count;
+    GGML_UNUSED(model);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#ifdef GGML_USE_RPC
-    std::string endpoint = model.rpc_servers[gpu];
-    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (gpu >= dev_count - rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+        return ggml_backend_rpc_buffer_type(endpoint);
+    }
+#endif
+#if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
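
The replacement routine changes the device numbering convention: local GPUs come first and RPC servers are appended at the end of the device list, so any index at or beyond `dev_count - rpc_count` is routed to an RPC endpoint. A sketch of the arithmetic with hypothetical counts (2 local GPUs, 2 RPC servers):

    int dev_count = 4; // llama_get_device_count(): 2 local + 2 RPC (hypothetical)
    int rpc_count = 2; // model.rpc_servers.size()
    for (int gpu = 0; gpu < dev_count; ++gpu) {
        if (gpu >= dev_count - rpc_count) {
            int rpc_index = gpu - (dev_count - rpc_count); // gpu 2 -> 0, gpu 3 -> 1
            // -> ggml_backend_rpc_buffer_type(model.rpc_servers[rpc_index].c_str())
        } else {
            // -> local CUDA/SYCL/Vulkan buffer type for device `gpu`
        }
    }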
@@ -2386,8 +2404,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     buft = ggml_backend_vk_buffer_type(gpu);
 #elif defined(GGML_USE_SYCL)
     buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-    buft = ggml_backend_opencl_buffer_type();
 #elif defined(GGML_USE_KOMPUTE)
     buft = ggml_backend_kompute_buffer_type(gpu);
     if (buft == nullptr) {
@@ -2426,29 +2442,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }
 
-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
-    return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-    GGML_UNUSED(model);
-}
-
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
-    size_t total;
-    size_t free;
-    std::string endpoint = model.rpc_servers[device];
-    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
-    return free;
-#elif defined(GGML_USE_CUDA)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (device >= dev_count - rpc_count) {
+        size_t total;
+        size_t free;
+        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+        return free;
+    }
+#endif
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &free, &total);
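
llama_get_device_memory follows the same convention: trailing indices are answered by querying the RPC endpoint's free/total memory, everything else falls through to the local backend. An illustrative loop (these are file-local helpers inside llama.cpp, so this is a sketch, not public API):

    // Sketch only: iterate all devices, RPC entries included at the tail.
    for (int d = 0; d < (int) llama_get_device_count(model); ++d) {
        size_t free_bytes = llama_get_device_memory(model, d);
        // free_bytes comes from ggml_backend_rpc_get_device_memory() for
        // trailing (RPC) indices, otherwise from the local backend query.
    }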
@@ -2520,10 +2526,6 @@ static bool llama_kv_cache_init(
         }
     }
 
-#ifdef GGML_USE_CLBLAST
-    offload = false;
-#endif
-
     // count used buffer types
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
     if (offload) {
@@ -4003,8 +4005,8 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
     hparams.rope_finetuned = rope_finetuned;
 
-    hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
-    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
+    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
+    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
 
     // rope_freq_base (optional)
     hparams.rope_freq_base_train = 10000.0f;
@@ -4740,7 +4742,20 @@ static void llm_load_vocab(
         auto & token_data = vocab.id_to_token[i];
         token_data.text  = std::move(word);
         token_data.score = scores ? scores[i] : 0.0f;
-        token_data.type  = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
+        token_data.attr  = LLAMA_TOKEN_ATTR_NORMAL;
+
+        if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
+            switch(toktypes[i]) {
+                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN;      break;
+                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attr = LLAMA_TOKEN_ATTR_UNUSED;       break;
+                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;       break;
+                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;      break;
+                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attr = LLAMA_TOKEN_ATTR_BYTE;         break;
+                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
+                default:                            token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
+            }
+        }
     }
     GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
 
@@ -4831,7 +4846,7 @@ static void llm_load_vocab(
     // build special tokens cache
     {
         for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-            if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
+            if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
                 vocab.cache_special_tokens.push_back(id);
             }
         }
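
Note how the special-token scan inverts: with the enum, a token was special when its type was anything but NORMAL; with bitflags, it is special when the NORMAL bit is unset. Under the one-to-one type-to-attribute mapping above, the two conditions select the same tokens:

    // Old, enum-based: special if type differs from NORMAL
    // bool special = token_data.type != LLAMA_TOKEN_TYPE_NORMAL;
    // New, flag-based: special if the NORMAL bit is not set
    bool special = !(token_data.attr & LLAMA_TOKEN_ATTR_NORMAL);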
@@ -4845,26 +4860,75 @@ static void llm_load_vocab(
         LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
     }
 
-    // build token to piece caches
+    // build token to piece cache
     {
         size_t size_cache = 0;
 
-        std::vector<llama_vocab::token> cache_token_to_piece        (n_vocab);
-        std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
+        std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
 
         for (uint32_t id = 0; id < n_vocab; ++id) {
-            cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
-            cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
+            cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
 
             size_cache += cache_token_to_piece[id].size();
-            size_cache += cache_token_to_piece_special[id].size();
         }
 
-        std::swap(vocab.cache_token_to_piece,         cache_token_to_piece);
-        std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+        std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
 
         LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
     }
+
+    // Handle per token attributes
+    //NOTE: Each model customizes per token attributes.
+    //NOTE: Per token attributes are missing from the GGUF file.
+    //TODO: Extract attributes from GGUF file.
+    {
+        auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
+            for (auto substr : substrs) {
+                if (str.find(substr) < std::string::npos) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
+            uint32_t current = vocab.id_to_token.at(id).attr;
+            current = value ? (current | attr) : (current & ~attr);
+            vocab.id_to_token[id].attr = (llama_token_attr) current;
+        };
+
+        auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+            _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
+        };
+
+        std::string model_name;
+        std::string tokenizer_pre;
+
+        ml.get_key(LLM_KV_GENERAL_NAME,  model_name,    false);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+
+        // model name to lowercase
+        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+            [] (const std::string::value_type x) {
+                return std::tolower(x);
+            }
+        );
+
+        // set attributes by model/tokenizer name
+        if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
+            _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+        } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+            for (auto id : vocab.cache_special_tokens) {
+                _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
+            }
+            for (auto token : {"</s>"}) {
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
+            }
+            for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
+            }
+        }
+    }
 }
 
 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
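
Because GGUF files do not yet carry per-token attributes, the loader patches them in by model family: Jina v2 (es/de) marks `<mask>` as left-stripping, and Phi-3 marks its special tokens as right-stripping except `<unk>`, `<s>` and `<|endoftext|>`. A hedged illustration of the resulting Phi-3 behavior (the exact token rendering here is hypothetical):

    // After loading a phi-3 vocab, a special token such as "<|end|>" has
    // LLAMA_TOKEN_ATTR_RSTRIP set, so in "...<|end|>   next" the spaces after
    // the special token are consumed by the tokenizer itself rather than
    // kept as part of the following text fragment.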
@@ -4904,7 +4968,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: rope scaling     = %s\n",   __func__, rope_scaling_type);
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",   __func__, hparams.rope_freq_scale_train);
-    LLAMA_LOG_INFO("%s: n_yarn_orig_ctx  = %u\n",   __func__, hparams.n_yarn_orig_ctx);
+    LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",   __func__, hparams.n_ctx_orig_yarn);
     LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",   __func__, hparams.rope_finetuned ? "yes" : "unknown");
     LLAMA_LOG_INFO("%s: ssm_d_conv       = %u\n",   __func__, hparams.ssm_d_conv);
     LLAMA_LOG_INFO("%s: ssm_d_inner      = %u\n",   __func__, hparams.ssm_d_inner);
@@ -5129,12 +5193,10 @@ static bool llm_load_tensors(
         // output
         {
             model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-            if (model.arch != LLM_ARCH_MINICPM){
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                // if output is NULL, init from the input tok embed
-                if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                }
+            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            // if output is NULL, init from the input tok embed
+            if (model.output == NULL) {
+                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
             }
         }
 
@@ -7072,7 +7134,7 @@ struct llm_build_context {
     const int32_t n_kv;      // size of KV cache to consider (n_kv <= kv_self.size)
     const int32_t n_outputs;
     const int32_t kv_head;   // index of where we store new KV data in the cache
-    const int32_t n_orig_ctx;
+    const int32_t n_ctx_orig;
 
     const bool flash_attn;
 
@@ -7121,7 +7183,7 @@ struct llm_build_context {
         n_kv             (worst_case ? kv_self.size : kv_self.n),
        n_outputs        (worst_case ? n_tokens : lctx.n_outputs),
         kv_head          (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
-        n_orig_ctx       (cparams.n_yarn_orig_ctx),
+        n_ctx_orig       (cparams.n_ctx_orig_yarn),
         flash_attn       (cparams.flash_attn),
         pooling_type     (cparams.pooling_type),
         rope_type        (hparams.rope_type),
@@ -7179,7 +7241,7 @@ struct llm_build_context {
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
                         0),
-                lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
 
             cb(tmp, "K_shifted", il);
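
From here on, dozens of call sites are updated for the new vendored ggml_rope_ext signature: the separate n_ctx argument (always passed as 0 at these sites) is gone, and the original training context is passed directly, renamed from n_orig_ctx to n_ctx_orig. A before/after sketch of one call site, taken from the pattern repeated below:

    // 0.15.x vendored ggml (old signature, n_ctx passed as 0):
    // Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
    //                      n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
    //                      ext_factor, attn_factor, beta_fast, beta_slow);

    // 0.16.0 vendored ggml (new signature, n_ctx dropped):
    // Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
    //                      n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
    //                      ext_factor, attn_factor, beta_fast, beta_slow);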
@@ -7288,7 +7350,7 @@ struct llm_build_context {
         // choose long/short freq factors based on the context size
         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
 
-        if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
+        if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
             return model.layers[il].rope_long;
         }
 
@@ -7404,14 +7466,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7535,12 +7597,12 @@ struct llm_build_context {
                     case MODEL_7B:
                         Qcur = ggml_rope_ext(
                             ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                            n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow
                         );
                         Kcur = ggml_rope_ext(
                             ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                            n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow
                         );
                         break;
@@ -7647,14 +7709,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7767,13 +7829,13 @@ struct llm_build_context {
 
                 // using mode = 2 for neox mode
                 Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7891,14 +7953,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -8044,14 +8106,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -8398,14 +8460,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -8838,14 +8900,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -8957,13 +9019,13 @@ struct llm_build_context {
 
                 // using mode = 2 for neox mode
                 Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9069,14 +9131,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9183,14 +9245,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9335,7 +9397,7 @@ struct llm_build_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -9346,7 +9408,7 @@ struct llm_build_context {
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9457,7 +9519,7 @@ struct llm_build_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -9466,7 +9528,7 @@ struct llm_build_context {
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9574,13 +9636,13 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
-                    n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);
 
@@ -9782,14 +9844,14 @@ struct llm_build_context {
 
                 struct ggml_tensor * Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 struct ggml_tensor * Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9898,14 +9960,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10015,14 +10077,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10145,14 +10207,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10217,7 +10279,7 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);
 
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
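
This MiniCPM lm_head fix pairs with the tensor-loading change earlier in the diff: model.output is now always populated, either from an explicit output.weight or duplicated from the token embeddings, so the head multiplication can use model.output unconditionally:

    // Sketch: after llm_load_tensors, model.output is never NULL, because
    // a missing output.weight falls back to token_embd.weight (tied embeddings):
    //     cur = ggml_mul_mat(ctx0, model.output, cur);
    // which now covers MiniCPM as well.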
@@ -10265,7 +10327,7 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
-                    n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);
 
@@ -10274,7 +10336,7 @@ struct llm_build_context {
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);
 
@@ -10385,14 +10447,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10675,14 +10737,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10806,14 +10868,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10920,14 +10982,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -11055,14 +11117,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -11272,7 +11334,7 @@ struct llm_build_context {
                 q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
                 q_pe = ggml_rope_ext(
                     ctx0, q_pe, inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor_scaled, beta_fast, beta_slow
                 );
                 cb(q_pe, "q_pe", il);
@@ -11281,7 +11343,7 @@ struct llm_build_context {
                 k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
                 k_pe = ggml_rope_ext(
                     ctx0, k_pe, inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor_scaled, beta_fast, beta_slow
                 );
                 cb(k_pe, "k_pe", il);
@@ -12616,27 +12678,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 
 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
 }
 
 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
 }
 
 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
 }
 
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
 }
 
 static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
 }
 
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
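
The classification predicates change from enum equality to bit tests, so a token can carry extra flags without disturbing its classification. A small sketch:

    // Illustrative: a token whose attr is CONTROL | RSTRIP.
    // llama_is_control_token(vocab, id) -> true  (CONTROL bit set)
    // llama_is_normal_token (vocab, id) -> false (NORMAL bit not set)
    // The extra RSTRIP bit does not affect either predicate.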
@@ -13254,7 +13316,8 @@ struct fragment_buffer_variant {
 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
     for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
-        const auto & special_token = vocab.id_to_token[special_id].text;
+        const auto & data = vocab.id_to_token[special_id];
+        const auto & special_token = data.text;
 
         // for each text fragment
         std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -13291,13 +13354,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     if (match > raw_text_base_offset) {
                         // left
                         const int64_t left_reminder_offset = raw_text_base_offset + 0;
-                        const int64_t left_reminder_length = match - raw_text_base_offset;
-                        buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                        int64_t left_reminder_length = match - raw_text_base_offset;
+
+                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
+                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+                                left_reminder_length--;
+                            }
+                        }
+
+                        if (left_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                            it++;
+                        }
 
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
-                        it++;
                     }
 
                     // special token
@@ -13306,16 +13378,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
                     // right
                     if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
-                        const int64_t right_reminder_offset = match + special_token.length();
-                        const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
-                        buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                        int64_t right_reminder_offset = match + special_token.length();
+                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+
+                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
+                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+                                right_reminder_offset++;
+                                right_reminder_length--;
+                            }
+                        }
+
+                        if (right_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                            it++;
+                        }
 
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
 #endif
 
-                        it++;
-
                         if (source == 0) {
                             buffer.erase_after(buffer.before_begin());
                         } else {
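
Combined with the attribute assignment during vocab load, this is what lets `<mask>` (Jina v2) consume the whitespace to its left and Phi-3 special tokens the whitespace to their right during special-token partitioning. A hedged worked example for RSTRIP (the token string is hypothetical):

    // Input fragment:            "foo <|end|>   bar"
    // Without RSTRIP on <|end|>: "foo " | <|end|> | "   bar"
    // With RSTRIP on <|end|>:    "foo " | <|end|> | "bar"
    // The stripped remainder is only re-queued if non-empty, hence the
    // new `if (right_reminder_length > 0)` guard around emplace_after().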
@@ -13361,9 +13442,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13361
13442
  // tokenizer.encode('', add_special_tokens=True) returns [1]
13362
13443
  // tokenizer.encode('', add_special_tokens=False) returns []
13363
13444
 
13364
- static const bool rtrim = true; //TODO: as param
13365
13445
  bool is_prev_special = false;
13366
- bool special_token_rtrim = false;
13367
13446
 
13368
13447
  if (add_special && vocab.special_add_bos != 0) {
13369
13448
  GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13373,25 +13452,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 
  for (const auto & fragment : fragment_buffer) {
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- // without adding this leading whitespace, we do not get the same results as the original tokenizer
-
- // TODO: It's likely possible to get rid of this string copy entirely
- // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
- // and passing 'add space prefix' as bool argument
- //
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
- if (special_token_rtrim) {
- size_t num_whitespaces = 0;
- while (isspace(raw_text[num_whitespaces])) {
- num_whitespaces++;
- }
- if (num_whitespaces == raw_text.size()) {
- continue; // skip if all whitespaces
- }
- raw_text = raw_text.substr(num_whitespaces);
- }
-
  if (vocab.add_space_prefix) {
  if (!output.size() || is_prev_special) { // prefix with space if first token
  raw_text = " " + raw_text;
@@ -13407,11 +13469,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  output.push_back(fragment.token);
  is_prev_special = true;
- // phi-3 special tokens without rtrim, works fine for llama-spm too
- special_token_rtrim = rtrim
- && fragment.token != vocab.special_bos_id
- && fragment.token != vocab.special_unk_id
- && fragment.token != vocab.special_eos_id;
  }
  }
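
With the hunks above, the ad-hoc special_token_rtrim bookkeeping in llama_tokenize_internal is gone; right-stripping is now driven purely by each token's LLAMA_TOKEN_ATTR_RSTRIP attribute during fragment partitioning. A hedged usage sketch of the public API that exercises this path (assumes the llama_tokenize signature shipped in this release):

    // C++ sketch: tokenize with special-token parsing on, so attribute-driven
    // stripping applies. A negative return means the buffer was too small.
    #include "llama.h"
    #include <string>
    #include <vector>

    std::vector<llama_token> tokenize(const llama_model * model, const std::string & text, bool add_special) {
        std::vector<llama_token> tokens(text.size() + 16); // rough upper bound
        int n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               add_special, /*parse_special=*/true);
        if (n < 0) { // buffer too small: -n is the required size
            tokens.resize(-n);
            n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               add_special, /*parse_special=*/true);
        }
        tokens.resize(n > 0 ? n : 0);
        return tokens;
    }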
 
@@ -14646,260 +14703,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }
 
- //
- // Beam search
- //
-
- struct llama_beam {
- std::vector<llama_token> tokens;
- float p; // Cumulative beam probability (renormalized relative to all beams)
- bool eob; // Initialize end-of-beam to false. Callback sets this to true.
- // Sort beams by probability. In case of ties, prefer beams at eob.
- bool operator<(const llama_beam & rhs) const {
- return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
- }
- // Shift off first n tokens and discard them.
- void shift_tokens(const size_t n) {
- if (n) {
- std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
- tokens.resize(tokens.size() - n);
- }
- }
- llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
- };
-
- // A struct for calculating logit-related info.
- struct llama_logit_info {
- const float * const logits;
- const int n_vocab;
- const float max_l;
- const float normalizer;
- struct sum_exp {
- float max_l;
- float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
- };
- llama_logit_info(llama_context * ctx)
- : logits(llama_get_logits(ctx))
- , n_vocab(llama_n_vocab(llama_get_model(ctx)))
- , max_l(*std::max_element(logits, logits + n_vocab))
- , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
- { }
- llama_token_data get_token_data(const llama_token token_id) const {
- constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
- return {token_id, logits[token_id], p};
- }
- // Return top k token_data by logit.
- std::vector<llama_token_data> top_k(size_t k) {
- std::vector<llama_token_data> min_heap; // min-heap by logit
- const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
- min_heap.reserve(k_min);
- for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
- min_heap.push_back(get_token_data(token_id));
- }
- auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
- std::make_heap(min_heap.begin(), min_heap.end(), comp);
- for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
- if (min_heap.front().logit < logits[token_id]) {
- std::pop_heap(min_heap.begin(), min_heap.end(), comp);
- min_heap.back().id = token_id;
- min_heap.back().logit = logits[token_id];
- std::push_heap(min_heap.begin(), min_heap.end(), comp);
- }
- }
- return min_heap;
- }
- float probability_from_logit(float logit) const {
- return normalizer * std::exp(logit - max_l);
- }
- };
-
- struct llama_beam_search_data {
- llama_context * ctx;
- size_t n_beams;
- int n_past;
- int n_predict;
- std::vector<llama_beam> beams;
- std::vector<llama_beam> next_beams;
-
- // Re-calculated on each loop iteration
- size_t common_prefix_length;
-
- // Used to communicate to/from callback on beams state.
- std::vector<llama_beam_view> beam_views;
-
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
- : ctx(ctx)
- , n_beams(n_beams)
- , n_past(n_past)
- , n_predict(n_predict)
- , beam_views(n_beams) {
- beams.reserve(n_beams);
- next_beams.reserve(n_beams);
- }
-
- // Collapse beams to a single beam given by index.
- void collapse_beams(const size_t beam_idx) {
- if (0u < beam_idx) {
- std::swap(beams[0], beams[beam_idx]);
- }
- beams.resize(1);
- }
-
- // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
- // The repetitive patterns below reflect the 2 stages of heaps:
- // * Gather elements until the vector is full, then call std::make_heap() on it.
- // * If the heap is full and a new element is found that should be included, pop the
- // least element to the back(), replace it with the new, then push it into the heap.
- void fill_next_beams_by_top_probabilities(llama_beam & beam) {
- // Min-heaps use a greater-than comparator.
- const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
- if (beam.eob) {
- // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
- if (next_beams.size() < n_beams) {
- next_beams.push_back(std::move(beam));
- if (next_beams.size() == n_beams) {
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
- }
- } else if (next_beams.front().p < beam.p) {
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
- next_beams.back() = std::move(beam);
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- } else {
- // beam is not at end-of-sentence, so branch with next top_k tokens.
- if (!beam.tokens.empty()) {
- llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
- }
- llama_logit_info logit_info(ctx);
- std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
-
- // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
- // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
- size_t i=0;
- if (next_beams.size() < n_beams) {
- for (; next_beams.size() < n_beams ; ++i) {
- llama_beam next_beam = beam;
- next_beam.tokens.push_back(next_tokens[i].id);
- next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
- next_beams.push_back(std::move(next_beam));
- }
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
- } else {
- for (; next_beams.front().p == 0.0f ; ++i) {
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
- next_beams.back() = beam;
- next_beams.back().tokens.push_back(next_tokens[i].id);
- next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- }
- for (; i < n_beams ; ++i) {
- const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
- if (next_beams.front().p < next_p) {
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
- next_beams.back() = beam;
- next_beams.back().tokens.push_back(next_tokens[i].id);
- next_beams.back().p = next_p;
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- }
- }
- }
-
- // Find common_prefix_length based on beams.
- // Requires beams is not empty.
- size_t find_common_prefix_length() {
- size_t common_prefix_length = beams[0].tokens.size();
- for (size_t i = 1 ; i < beams.size() ; ++i) {
- common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
- for (size_t j = 0 ; j < common_prefix_length ; ++j) {
- if (beams[0].tokens[j] != beams[i].tokens[j]) {
- common_prefix_length = j;
- break;
- }
- }
- }
- return common_prefix_length;
- }
-
- // Construct beams_state to send back to caller via the callback function.
- // Side effect: set common_prefix_length = find_common_prefix_length();
- llama_beams_state get_beams_state(const bool last_call) {
- for (size_t i = 0 ; i < beams.size() ; ++i) {
- beam_views[i] = beams[i].view();
- }
- common_prefix_length = find_common_prefix_length();
- return {beam_views.data(), beams.size(), common_prefix_length, last_call};
- }
-
- // Loop:
- // * while i < n_predict, AND
- // * any of the beams have not yet reached end-of-beam (eob), AND
- // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
- // (since all other beam probabilities can only decrease)
- void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
- beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
- const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
- for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
- !beams[top_beam_index()].eob ; ++i) {
- callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
- update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
- if (common_prefix_length) {
- llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
- n_past += common_prefix_length;
- }
- // Zero-out next_beam probabilities to place them last in following min-heap.
- std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
- for (llama_beam & beam : beams) {
- beam.shift_tokens(common_prefix_length);
- fill_next_beams_by_top_probabilities(beam);
- }
- // next_beams become the beams of next/final iteration. Swap them to re-use memory.
- beams.swap(next_beams);
- renormalize_beam_probabilities(beams);
- }
- collapse_beams(top_beam_index());
- callback(callback_data, get_beams_state(true));
- }
-
- // As beams grow, the cumulative probabilities decrease.
- // Renormalize them to avoid floating point underflow.
- static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
- const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
- const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
- std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
- }
-
- // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
- size_t top_beam_index() {
- return std::max_element(beams.begin(), beams.end()) - beams.begin();
- }
-
- // Copy (p,eob) for each beam which may have been changed by the callback.
- void update_beams_from_beam_views() {
- for (size_t i = 0 ; i < beams.size() ; ++i) {
- beams[i].p = beam_views[i].p;
- beams[i].eob = beam_views[i].eob;
- }
- }
- };
-
- void llama_beam_search(llama_context * ctx,
- llama_beam_search_callback_fn_t callback, void * callback_data,
- size_t n_beams, int n_past, int n_predict) {
- assert(ctx);
- const int64_t t_start_sample_us = ggml_time_us();
-
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
-
- beam_search_data.loop(callback, callback_data);
-
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- ctx->n_sample++;
- }
-
  //
  // quantization
  //
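
The block above deletes the experimental beam-search API (llama_beam_search, llama_beam, llama_logit_info, llama_beam_search_data) outright; upstream llama.cpp dropped it in favor of the regular decode-and-sample entry points. A rough migration sketch using plain greedy decoding (assumes this release's sampling API; error handling elided):

    // C++ sketch: greedy decoding loop as a stand-in for the removed llama_beam_search().
    #include "llama.h"
    #include <vector>

    void greedy_generate(llama_context * ctx, std::vector<llama_token> tokens, int n_past, int n_predict) {
        const llama_model * model = llama_get_model(ctx);
        for (int i = 0; i < n_predict; ++i) {
            // evaluate the pending tokens, then pick the highest-logit continuation
            llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), n_past, 0));
            n_past += (int) tokens.size();

            const float * logits  = llama_get_logits(ctx);
            const int     n_vocab = llama_n_vocab(model);

            std::vector<llama_token_data> cand(n_vocab);
            for (llama_token id = 0; id < n_vocab; ++id) {
                cand[id] = { id, logits[id], 0.0f };
            }
            llama_token_data_array arr = { cand.data(), cand.size(), false };

            const llama_token next = llama_sample_token_greedy(ctx, &arr);
            if (llama_token_is_eog(model, next)) {
                break;
            }
            tokens.assign(1, next); // feed back only the newly sampled token
        }
    }
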
@@ -16110,7 +15913,7 @@ bool llama_supports_mlock(void) {
  }
 
  bool llama_supports_gpu_offload(void) {
- #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+ #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
  defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  return true;
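
GGML_USE_CLBLAST disappears from this capability check because the CLBlast backend was removed from the vendored llama.cpp. A small sketch of relying on the runtime check rather than compile-time guesses (the helper is illustrative, not from the diff):

    // C++ sketch: only request GPU layers when the build actually supports offload.
    #include "llama.h"

    llama_model_params make_model_params(int n_gpu_layers_wanted) {
        llama_model_params mp = llama_model_default_params();
        mp.n_gpu_layers = llama_supports_gpu_offload() ? n_gpu_layers_wanted : 0;
        return mp;
    }
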
@@ -16167,7 +15970,7 @@ struct llama_model * llama_load_model_from_file(
  return true;
  };
  }
- if (params.rpc_servers != nullptr) {
+ if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
  // split the servers set them into model->rpc_servers
  std::string servers(params.rpc_servers);
  size_t pos = 0;
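
The added check on params.rpc_servers[0] means an empty string is now treated like a null pointer instead of yielding one bogus empty endpoint. A hedged sketch of how the parameter is fed in (the endpoint addresses are example values):

    // C++ sketch: load a model with remote RPC workers attached.
    #include "llama.h"

    llama_model * load_with_rpc(const char * path) {
        llama_model_params mp = llama_model_default_params();
        mp.rpc_servers = "192.168.1.10:50052,192.168.1.11:50052"; // comma-separated host:port list
        return llama_load_model_from_file(path, mp);
    }
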
@@ -16221,6 +16024,11 @@ struct llama_context * llama_new_context_with_model(
  params.flash_attn = false;
  }
 
+ if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+ LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
+ return nullptr;
+ }
+
  llama_context * ctx = new llama_context(*model);
 
  const auto & hparams = model->hparams;
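
The new guard turns a quantized V cache without flash attention into a hard error (llama_new_context_with_model returns nullptr) instead of silently producing a broken context. A sketch of a configuration this accepts (assumes this release's llama_context_params fields):

    // C++ sketch: a quantized V cache now requires flash_attn = true.
    #include "llama.h"

    llama_context_params make_ctx_params(void) {
        llama_context_params cp = llama_context_default_params();
        cp.type_v     = GGML_TYPE_Q8_0; // quantized V cache...
        cp.flash_attn = true;           // ...is rejected unless flash attention is enabled
        return cp;
    }
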
@@ -16259,8 +16067,8 @@ struct llama_context * llama_new_context_with_model(
 
  cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
- cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
- hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
+ cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
+ hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
  hparams.n_ctx_train;
 
  cparams.cb_eval = params.cb_eval;
@@ -16325,17 +16133,7 @@ struct llama_context * llama_new_context_with_model(
 
  if (!hparams.vocab_only) {
  // initialize backends
- #if defined(GGML_USE_RPC)
- for (auto & server : model->rpc_servers) {
- ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
- #elif defined(GGML_USE_METAL)
+ #if defined(GGML_USE_METAL)
  if (model->n_gpu_layers > 0) {
  ctx->backend_metal = ggml_backend_metal_init();
  if (ctx->backend_metal == nullptr) {
@@ -16374,7 +16172,7 @@ struct llama_context * llama_new_context_with_model(
  return nullptr;
  }
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
- ggml_backend_t backend = ggml_backend_vk_init(0);
+ ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
  if (backend == nullptr) {
  LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
  llama_free(ctx);
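
With this change the single-device Vulkan path honors main_gpu instead of hard-coding device 0. A sketch of selecting the second Vulkan device (illustrative values):

    // C++ sketch: pick Vulkan device 1 when not splitting across GPUs.
    #include "llama.h"

    llama_model_params pick_vulkan_device(void) {
        llama_model_params mp = llama_model_default_params();
        mp.split_mode = LLAMA_SPLIT_MODE_NONE; // single-device mode
        mp.main_gpu   = 1;                     // forwarded to ggml_backend_vk_init()
        return mp;
    }
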
@@ -16427,6 +16225,19 @@ struct llama_context * llama_new_context_with_model(
  }
  ctx->backends.push_back(backend);
  }
+ #endif
+ #if defined(GGML_USE_RPC)
+ if (model->n_gpu_layers > 0) {
+ for (const auto & endpoint : model->rpc_servers) {
+ ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+ }
  #endif
  ctx->backend_cpu = ggml_backend_cpu_init();
  if (ctx->backend_cpu == nullptr) {
@@ -18209,9 +18020,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
  return model->vocab.id_to_token[token].score;
  }
 
- llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+ llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
  GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return model->vocab.id_to_token[token].type;
+ return model->vocab.id_to_token[token].attr;
  }
 
  bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
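
llama_token_get_type is renamed to llama_token_get_attr, and the per-token field becomes a bitmask of llama_token_attr flags rather than a single type enum. A sketch of testing one flag (assumes the LLAMA_TOKEN_ATTR_* constants this release adds to llama.h):

    // C++ sketch: attributes are bitflags now, so test them with a bitwise AND.
    #include "llama.h"

    bool is_control_token(const llama_model * model, llama_token tok) {
        const llama_token_attr attr = llama_token_get_attr(model, tok);
        return (attr & LLAMA_TOKEN_ATTR_CONTROL) != 0;
    }
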
@@ -18313,9 +18124,14 @@ static std::string llama_decode_text(const std::string & text) {
 
  // does not write null-terminator to buf
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+ // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+ if (!special && llama_is_control_token(model->vocab, token)) {
+ return 0;
+ }
+
  // if we have a cache - use it
  {
- const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
+ const auto & cache = model->vocab.cache_token_to_piece;
  if (!cache.empty()) {
  const auto & res = cache.at(token);