whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -39,6 +39,7 @@ enum llm_type {
39
39
  LLM_TYPE_475M,
40
40
  LLM_TYPE_770M,
41
41
  LLM_TYPE_780M,
42
+ LLM_TYPE_0_3B,
42
43
  LLM_TYPE_0_5B,
43
44
  LLM_TYPE_0_6B,
44
45
  LLM_TYPE_1B,
@@ -73,6 +74,7 @@ enum llm_type {
73
74
  LLM_TYPE_40B,
74
75
  LLM_TYPE_65B,
75
76
  LLM_TYPE_70B,
77
+ LLM_TYPE_142B,
76
78
  LLM_TYPE_236B,
77
79
  LLM_TYPE_290B,
78
80
  LLM_TYPE_314B,
@@ -94,6 +96,8 @@ enum llm_type {
94
96
  LLM_TYPE_17B_128E, // llama4 Maverick
95
97
  LLM_TYPE_30B_A3B,
96
98
  LLM_TYPE_235B_A22B,
99
+ LLM_TYPE_E2B,
100
+ LLM_TYPE_E4B,
97
101
  };
98
102
 
99
103
  std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
@@ -315,6 +319,19 @@ struct llama_layer {
315
319
  struct ggml_tensor * ffn_up_scale = nullptr;
316
320
  struct ggml_tensor * ffn_down_scale = nullptr;
317
321
 
322
+ // altup & laurel
323
+ struct ggml_tensor * per_layer_inp_gate = nullptr;
324
+ struct ggml_tensor * per_layer_proj = nullptr;
325
+ struct ggml_tensor * per_layer_post_norm = nullptr;
326
+ struct ggml_tensor * altup_correct_coef = nullptr;
327
+ struct ggml_tensor * altup_correct_scale = nullptr;
328
+ struct ggml_tensor * altup_predict_coef = nullptr;
329
+ struct ggml_tensor * altup_router = nullptr;
330
+ struct ggml_tensor * altup_router_norm = nullptr;
331
+ struct ggml_tensor * laurel_l = nullptr;
332
+ struct ggml_tensor * laurel_r = nullptr;
333
+ struct ggml_tensor * laurel_post_norm = nullptr;
334
+
318
335
  struct llama_layer_posnet posnet;
319
336
 
320
337
  struct llama_layer_convnext convnext;
@@ -329,6 +346,9 @@ struct llama_model {
329
346
  llama_hparams hparams = {};
330
347
  llama_vocab vocab;
331
348
 
349
+ // for classifier models
350
+ std::vector<std::string> classifier_labels;
351
+
332
352
  struct ggml_tensor * tok_embd = nullptr;
333
353
  struct ggml_tensor * type_embd = nullptr;
334
354
  struct ggml_tensor * pos_embd = nullptr;
@@ -350,6 +370,13 @@ struct llama_model {
350
370
  struct ggml_tensor * conv1d = nullptr;
351
371
  struct ggml_tensor * conv1d_b = nullptr;
352
372
 
373
+ // gemma3n altup
374
+ struct ggml_tensor * tok_embd_per_layer = nullptr;
375
+ struct ggml_tensor * altup_proj = nullptr;
376
+ struct ggml_tensor * altup_unembd_proj = nullptr;
377
+ struct ggml_tensor * per_layer_model_proj = nullptr;
378
+ struct ggml_tensor * per_layer_proj_norm = nullptr;
379
+
353
380
  std::vector<llama_layer> layers;
354
381
 
355
382
  llama_model_params params;
@@ -1,5 +1,4 @@
1
1
  #include "llama-quant.h"
2
-
3
2
  #include "llama-impl.h"
4
3
  #include "llama-model.h"
5
4
  #include "llama-model-loader.h"
@@ -27,6 +26,56 @@ static void zeros(std::ofstream & file, size_t n) {
27
26
  }
28
27
  }
29
28
 
29
+ static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
30
+ if (prune.empty()) {
31
+ return orig_name;
32
+ }
33
+
34
+ static const std::regex pattern(R"(blk\.(\d+)\.)");
35
+ if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
36
+ const int blk = std::stoi(match[1]);
37
+ std::string new_name = orig_name;
38
+
39
+ if (mapped.count(blk)) {
40
+ // Already mapped, do nothing
41
+ } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
42
+ mapped[blk] = "";
43
+ } else if (blk < prune.front()) {
44
+ mapped[blk] = std::to_string(blk);
45
+ next_id = blk + 1;
46
+ } else {
47
+ mapped[blk] = std::to_string(next_id);
48
+ ++next_id;
49
+ }
50
+
51
+ return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
52
+ }
53
+
54
+ return orig_name;
55
+ }
56
+
57
+ static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
58
+ if (mapped.empty()) {
59
+ return orig_name;
60
+ }
61
+
62
+ static const std::regex pattern(R"(blk\.(\d+)\.)");
63
+ if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
64
+ const std::string blk(match[1]);
65
+ std::string new_name = orig_name;
66
+
67
+ for (const auto & p : mapped) {
68
+ if (p.second == blk) {
69
+ LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
70
+ return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
71
+ }
72
+ }
73
+ GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
74
+ }
75
+
76
+ return orig_name;
77
+ }
78
+
30
79
  struct quantize_state_impl {
31
80
  const llama_model & model;
32
81
  const llama_model_quantize_params * params;
@@ -174,7 +223,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
174
223
  new_type = GGML_TYPE_Q6_K;
175
224
  }
176
225
  }
177
- } else if (name == "token_embd.weight") {
226
+ } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
178
227
  if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
179
228
  new_type = qs.params->token_embedding_type;
180
229
  } else {
@@ -568,6 +617,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
568
617
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
569
618
  gguf_context_ptr ctx_out { gguf_init_empty() };
570
619
 
620
+ std::vector<int> prune_list = {};
621
+ if (params->prune_layers) {
622
+ prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
623
+ }
624
+
571
625
  // copy the KV pairs from the input file
572
626
  gguf_set_kv (ctx_out.get(), ml.meta.get());
573
627
  gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
@@ -585,7 +639,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
585
639
  if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
586
640
  gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
587
641
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
588
- gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64);
642
+ // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
643
+ gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)abs(o.val_i64));
589
644
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
590
645
  gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
591
646
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
@@ -596,12 +651,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
596
651
  }
597
652
  }
598
653
 
654
+ std::map<int, std::string> mapped;
655
+ int blk_id = 0;
656
+ int pruned_attention_w = 0;
657
+
599
658
  // make a list of weights
600
659
  std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
601
660
  tensors.reserve(ml.weights_map.size());
602
661
  for (const auto & it : ml.weights_map) {
662
+ const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
663
+ if (remapped_name.empty()) {
664
+ if (it.first.find("attn_v.weight") != std::string::npos ||
665
+ it.first.find("attn_qkv.weight") != std::string::npos ||
666
+ it.first.find("attn_kv_b.weight") != std::string::npos) {
667
+ pruned_attention_w++;
668
+ }
669
+ LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
670
+ continue;
671
+ } else if (remapped_name != it.first) {
672
+ ggml_set_name(it.second.tensor, remapped_name.c_str());
673
+ LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
674
+ }
603
675
  tensors.push_back(&it.second);
604
676
  }
677
+ if (!prune_list.empty()) {
678
+ gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
679
+ }
605
680
 
606
681
  // keep_split requires that the weights are sorted by split index
607
682
  if (params->keep_split) {
@@ -639,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
639
714
  if (llama_model_has_encoder(&model)) {
640
715
  n_attn_layer *= 3;
641
716
  }
642
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
717
+ GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
643
718
  }
644
719
 
645
720
  size_t total_size_org = 0;
@@ -680,7 +755,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
680
755
  for (size_t i = 0; i < ctx_outs.size(); ++i) {
681
756
  gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
682
757
  gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
683
- gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
758
+ gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
684
759
  }
685
760
  }
686
761
 
@@ -755,6 +830,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
755
830
  // NOTE: can't use LLM_TN here because the layer number is not known
756
831
  quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
757
832
 
833
+ // these are very small (e.g. 4x4)
834
+ quantize &= name.find("altup") == std::string::npos;
835
+ quantize &= name.find("laurel") == std::string::npos;
836
+
837
+ // these are not too big so keep them as it is
838
+ quantize &= name.find("per_layer_model_proj") == std::string::npos;
839
+
758
840
  // do not quantize positional embeddings and token types (BERT)
759
841
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
760
842
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
@@ -831,7 +913,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
831
913
 
832
914
  const float * imatrix = nullptr;
833
915
  if (imatrix_data) {
834
- auto it = imatrix_data->find(tensor->name);
916
+ auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
835
917
  if (it == imatrix_data->end()) {
836
918
  LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
837
919
  } else {
@@ -946,6 +1028,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
946
1028
  /*.imatrix =*/ nullptr,
947
1029
  /*.kv_overrides =*/ nullptr,
948
1030
  /*.tensor_type =*/ nullptr,
1031
+ /*.prune_layers =*/ nullptr
949
1032
  };
950
1033
 
951
1034
  return result;
@@ -9,16 +9,16 @@
9
9
 
10
10
  #include <algorithm>
11
11
  #include <cassert>
12
+ #include <cctype>
12
13
  #include <cfloat>
13
- #include <climits>
14
14
  #include <cstdarg>
15
15
  #include <cstring>
16
16
  #include <forward_list>
17
+ #include <limits>
17
18
  #include <map>
18
19
  #include <queue>
19
20
  #include <set>
20
21
  #include <unordered_map>
21
- #include <cctype>
22
22
 
23
23
  //
24
24
  // helpers
@@ -1269,6 +1269,7 @@ struct llama_vocab::impl {
1269
1269
  bool add_space_prefix = false;
1270
1270
  bool add_bos = false;
1271
1271
  bool add_eos = false;
1272
+ bool add_sep = false;
1272
1273
  bool ignore_merges = false;
1273
1274
  bool clean_spaces = false; // clean_up_tokenization_spaces
1274
1275
  bool remove_extra_whitespaces = false;
@@ -1421,6 +1422,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1421
1422
  special_sep_id = 102;
1422
1423
  special_pad_id = 0;
1423
1424
  special_mask_id = 103;
1425
+
1426
+ add_sep = true;
1424
1427
  } else if (tokenizer_model == "gpt2") {
1425
1428
  type = LLAMA_VOCAB_TYPE_BPE;
1426
1429
 
@@ -1550,12 +1553,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1550
1553
  tokenizer_pre == "jina-es" ||
1551
1554
  tokenizer_pre == "jina-de" ||
1552
1555
  tokenizer_pre == "gigachat" ||
1553
- tokenizer_pre == "jina-v1-en" ||
1554
1556
  tokenizer_pre == "jina-v2-es" ||
1555
- tokenizer_pre == "jina-v2-de" ||
1557
+ tokenizer_pre == "jina-v2-de") {
1558
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1559
+ } else if (
1560
+ tokenizer_pre == "jina-v1-en" ||
1556
1561
  tokenizer_pre == "jina-v2-code" ||
1557
1562
  tokenizer_pre == "roberta-bpe") {
1558
1563
  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1564
+ add_sep = true;
1559
1565
  } else if (
1560
1566
  tokenizer_pre == "refact") {
1561
1567
  pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -1665,6 +1671,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1665
1671
  clean_spaces = true;
1666
1672
  add_bos = true;
1667
1673
  add_eos = false;
1674
+ add_sep = true;
1668
1675
  } else if (type == LLAMA_VOCAB_TYPE_UGM) {
1669
1676
  pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1670
1677
  add_bos = false;
@@ -1801,7 +1808,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1801
1808
  }
1802
1809
  }
1803
1810
 
1804
- // Handle add_bos and add_eos
1811
+ // Handle add_bos, add_eos and add_sep
1805
1812
  {
1806
1813
  bool temp = true;
1807
1814
 
@@ -1811,6 +1818,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1811
1818
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
1812
1819
  add_eos = temp;
1813
1820
  }
1821
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
1822
+ add_sep = temp;
1823
+ }
1814
1824
  }
1815
1825
 
1816
1826
  // auto-detect special tokens by text
@@ -1987,6 +1997,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1987
1997
  || t.first == "<|eom_id|>"
1988
1998
  || t.first == "<EOT>"
1989
1999
  || t.first == "_<EOT>"
2000
+ || t.first == "<|end_of_text|>"
1990
2001
  ) {
1991
2002
  special_eog_ids.insert(t.second);
1992
2003
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2059,9 +2070,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2059
2070
  //NOTE: Per token attributes are missing from the GGUF file.
2060
2071
  //TODO: Extract attributes from GGUF file.
2061
2072
  {
2062
- auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
2073
+ auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
2063
2074
  for (const auto & substr : substrs) {
2064
- if (str.find(substr) < std::string::npos) {
2075
+ if (str.find(substr) != std::string::npos) {
2065
2076
  return true;
2066
2077
  }
2067
2078
  }
@@ -2080,9 +2091,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2080
2091
 
2081
2092
  std::string model_name;
2082
2093
  std::string tokenizer_pre;
2094
+ std::string general_arch;
2083
2095
 
2084
2096
  ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
2085
2097
  ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
2098
+ ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
2086
2099
 
2087
2100
  // model name to lowercase
2088
2101
  std::transform(model_name.begin(), model_name.end(), model_name.begin(),
@@ -2091,9 +2104,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2091
2104
  }
2092
2105
  );
2093
2106
 
2094
- // set attributes by model/tokenizer name
2095
- if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
2096
- _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2107
+ // set attributes by model/tokenizer/architecture name
2108
+ if (false
2109
+ || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2110
+ || _contains_any(general_arch, {"nomic-bert-moe"})
2111
+ ) {
2112
+ if (token_to_id.count("<mask>") == 0) {
2113
+ LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
2114
+ } else {
2115
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2116
+ }
2097
2117
  } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
2098
2118
  for (auto id : cache_special_tokens) {
2099
2119
  _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
@@ -2563,6 +2583,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
2563
2583
  // copy piece chars to output text buffer
2564
2584
  // skip up to 'lstrip' leading spaces before copying
2565
2585
  auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
2586
+ if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
2587
+ GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
2588
+ }
2589
+
2566
2590
  for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
2567
2591
  token++;
2568
2592
  size--;
@@ -2759,26 +2783,26 @@ void llama_vocab::impl::print_info() const {
2759
2783
  LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
2760
2784
 
2761
2785
  // special tokens
2762
- if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); }
2763
- if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); }
2764
- if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); }
2765
- if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); }
2766
- if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); }
2767
- if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); }
2768
- if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); }
2769
- if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); }
2770
-
2771
- if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); }
2772
-
2773
- if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
2774
- if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
2775
- if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
2776
- if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
2777
- if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
2778
- if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }
2786
+ if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
2787
+ if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
2788
+ if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
2789
+ if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
2790
+ if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
2791
+ if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
2792
+ if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
2793
+ if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
2794
+
2795
+ if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
2796
+
2797
+ if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
2798
+ if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
2799
+ if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
2800
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
2801
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
2802
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
2779
2803
 
2780
2804
  for (const auto & id : special_eog_ids) {
2781
- LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
2805
+ LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
2782
2806
  }
2783
2807
 
2784
2808
  LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
@@ -2986,6 +3010,10 @@ bool llama_vocab::get_add_eos() const {
2986
3010
  return pimpl->add_eos;
2987
3011
  }
2988
3012
 
3013
+ bool llama_vocab::get_add_sep() const {
3014
+ return pimpl->add_sep;
3015
+ }
3016
+
2989
3017
  bool llama_vocab::get_ignore_merges() const {
2990
3018
  return pimpl->ignore_merges;
2991
3019
  }
@@ -3046,6 +3074,11 @@ int32_t llama_vocab::tokenize(
3046
3074
  bool add_special,
3047
3075
  bool parse_special) const {
3048
3076
  auto res = tokenize(std::string(text, text_len), add_special, parse_special);
3077
+ if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
3078
+ LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
3079
+ return std::numeric_limits<int32_t>::min();
3080
+ }
3081
+
3049
3082
  if (n_tokens_max < (int) res.size()) {
3050
3083
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
3051
3084
  return -((int) res.size());
@@ -3177,6 +3210,10 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
3177
3210
  return vocab->get_add_eos();
3178
3211
  }
3179
3212
 
3213
+ bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
3214
+ return vocab->get_add_sep();
3215
+ }
3216
+
3180
3217
  llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
3181
3218
  return vocab->token_fim_pre();
3182
3219
  }
@@ -74,6 +74,7 @@ struct llama_vocab {
74
74
  bool get_add_space_prefix () const;
75
75
  bool get_add_bos () const;
76
76
  bool get_add_eos () const;
77
+ bool get_add_sep () const;
77
78
  bool get_ignore_merges () const;
78
79
  bool get_clean_spaces () const;
79
80
  bool get_remove_extra_whitespaces () const;
@@ -198,14 +198,18 @@ static struct llama_model * llama_model_load_from_file_impl(
198
198
 
199
199
  // if using single GPU mode, remove all except the main GPU
200
200
  if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
201
- if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
202
- LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
203
- llama_model_free(model);
204
- return nullptr;
201
+ if (params.main_gpu < 0) {
202
+ model->devices.clear();
203
+ } else {
204
+ if (params.main_gpu >= (int)model->devices.size()) {
205
+ LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
206
+ llama_model_free(model);
207
+ return nullptr;
208
+ }
209
+ ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
210
+ model->devices.clear();
211
+ model->devices.push_back(main_gpu);
205
212
  }
206
- ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
207
- model->devices.clear();
208
- model->devices.push_back(main_gpu);
209
213
  }
210
214
 
211
215
  for (auto * dev : model->devices) {