whispercpp 1.3.5 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1017)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/LICENSE +1 -1
  5. data/README.md +133 -3
  6. data/Rakefile +18 -3
  7. data/ext/dependencies.rb +10 -4
  8. data/ext/dependencies_for_windows.rb +17 -0
  9. data/ext/extconf.rb +20 -7
  10. data/ext/options.rb +54 -14
  11. data/ext/options_for_windows.rb +51 -0
  12. data/ext/ruby_whisper.c +56 -46
  13. data/ext/ruby_whisper.h +165 -2
  14. data/ext/ruby_whisper_context.c +297 -126
  15. data/ext/ruby_whisper_context_params.c +163 -0
  16. data/ext/ruby_whisper_log_queue.c +180 -0
  17. data/ext/ruby_whisper_log_settable.h +47 -0
  18. data/ext/ruby_whisper_model.c +0 -1
  19. data/ext/ruby_whisper_parakeet.c +49 -0
  20. data/ext/ruby_whisper_parakeet_context.c +304 -0
  21. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  22. data/ext/ruby_whisper_parakeet_model.c +84 -0
  23. data/ext/ruby_whisper_parakeet_params.c +548 -0
  24. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  25. data/ext/ruby_whisper_parakeet_token.c +188 -0
  26. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  27. data/ext/ruby_whisper_params.c +256 -66
  28. data/ext/ruby_whisper_segment.c +6 -7
  29. data/ext/ruby_whisper_token.c +29 -9
  30. data/ext/ruby_whisper_transcribe.cpp +46 -16
  31. data/ext/ruby_whisper_vad_context.c +48 -1
  32. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  33. data/ext/ruby_whisper_vad_params.c +0 -1
  34. data/ext/ruby_whisper_vad_segment.c +0 -1
  35. data/ext/ruby_whisper_vad_segments.c +0 -1
  36. data/ext/sources/CMakeLists.txt +41 -3
  37. data/ext/sources/CMakePresets.json +95 -0
  38. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  39. data/ext/sources/cmake/parakeet.pc.in +10 -0
  40. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  41. data/ext/sources/cmake/whisper.pc.in +1 -1
  42. data/ext/sources/examples/CMakeLists.txt +4 -2
  43. data/ext/sources/examples/bench/bench.cpp +24 -19
  44. data/ext/sources/examples/cli/cli.cpp +51 -9
  45. data/ext/sources/examples/common-ggml.cpp +4 -0
  46. data/ext/sources/examples/common-whisper.cpp +139 -67
  47. data/ext/sources/examples/common-whisper.h +11 -0
  48. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  49. data/ext/sources/examples/miniaudio.h +4507 -2131
  50. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  51. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  52. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  53. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  54. data/ext/sources/examples/server/server.cpp +213 -163
  55. data/ext/sources/ggml/CMakeLists.txt +29 -15
  56. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  57. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  58. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  59. data/ext/sources/ggml/include/ggml-backend.h +73 -11
  60. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  61. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  62. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  63. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  64. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  65. data/ext/sources/ggml/include/ggml-rpc.h +8 -3
  66. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  67. data/ext/sources/ggml/include/ggml.h +155 -16
  68. data/ext/sources/ggml/include/gguf.h +10 -2
  69. data/ext/sources/ggml/src/CMakeLists.txt +25 -5
  70. data/ext/sources/ggml/src/ggml-alloc.c +9 -10
  71. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  72. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  73. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  74. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +40 -86
  76. data/ext/sources/ggml/src/ggml-backend.cpp +114 -10
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -2
  79. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +1016 -442
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +111 -85
  83. data/ext/sources/ggml/src/ggml-cann/common.h +23 -14
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +255 -92
  85. data/ext/sources/ggml/src/ggml-common.h +22 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +68 -34
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +44 -19
  88. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  89. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +101 -101
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +194 -1
  91. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2874 -613
  92. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +5480 -840
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1361 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -11
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +186 -36
  99. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +119 -19
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +112 -26
  101. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  102. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  103. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -0
  105. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +153 -16
  106. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +17 -0
  107. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  108. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +976 -251
  109. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +671 -266
  110. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1277 -263
  111. data/ext/sources/ggml/src/ggml-cpu/ops.h +4 -0
  112. data/ext/sources/ggml/src/ggml-cpu/quants.c +95 -0
  113. data/ext/sources/ggml/src/ggml-cpu/quants.h +6 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2893 -679
  115. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  116. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  117. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +114 -19
  118. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  119. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  120. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  121. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  122. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  123. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  124. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  125. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  126. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  127. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  128. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  129. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  130. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  131. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  132. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  133. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  134. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +54 -53
  135. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  136. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +18 -8
  137. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  138. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  139. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +73 -28
  140. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +69 -41
  141. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  142. data/ext/sources/ggml/src/ggml-cuda/common.cuh +359 -29
  143. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  144. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  145. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  146. data/ext/sources/ggml/src/ggml-cuda/convert.cu +94 -27
  147. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  148. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +20 -9
  149. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  150. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +333 -85
  151. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +632 -190
  152. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  153. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +162 -49
  154. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +43 -18
  155. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +44 -14
  156. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  157. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +241 -23
  158. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  159. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  160. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  161. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +312 -0
  162. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  163. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  164. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1454 -599
  165. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  166. data/ext/sources/ggml/src/ggml-cuda/mean.cu +13 -10
  167. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +397 -183
  168. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  169. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +161 -88
  170. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  171. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +522 -431
  172. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +139 -72
  173. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  174. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +608 -88
  175. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -0
  176. data/ext/sources/ggml/src/ggml-cuda/norm.cu +47 -79
  177. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  178. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  179. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +134 -27
  180. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  181. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +7 -17
  182. data/ext/sources/ggml/src/ggml-cuda/rope.cu +244 -137
  183. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  184. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  185. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  186. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  187. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  188. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  189. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  190. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +96 -40
  191. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  192. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  193. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +2 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +2 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +2 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +2 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -5
  226. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +202 -135
  227. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  228. data/ext/sources/ggml/src/ggml-cuda/unary.cu +86 -2
  229. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +4 -0
  230. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +111 -17
  231. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  232. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +30 -2
  233. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +84 -46
  235. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1612 -753
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +51 -11
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +361 -261
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +294 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +753 -241
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +295 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +471 -296
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +159 -53
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +3 -3
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  254. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  255. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  256. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  257. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  258. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  259. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  260. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  261. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  262. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +97 -14
  263. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +163 -67
  264. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  265. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  266. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  267. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  268. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  269. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  270. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  271. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  272. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  273. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  274. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  275. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  276. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  278. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -1348
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +547 -635
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +3556 -1101
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +475 -269
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +94 -72
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +222 -217
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +432 -0
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +886 -117
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  297. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  302. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +28 -9
  303. data/ext/sources/ggml/src/ggml-impl.h +68 -1
  304. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  305. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  306. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  307. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  308. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +409 -83
  309. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +54 -5
  310. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +254 -52
  311. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +254 -23
  312. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +756 -285
  313. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +7 -4
  314. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +359 -133
  315. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1867 -1123
  316. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  317. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +71 -4
  318. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +14127 -5314
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +97 -88
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +104 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1978 -67
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  367. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  368. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  369. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  370. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  371. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  372. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  373. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  374. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  375. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  376. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  377. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  378. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  379. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  380. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  381. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  382. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  383. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  384. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  385. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  386. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  387. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  388. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  389. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +178 -0
  390. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  391. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  392. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  393. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  394. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  395. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  396. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  397. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  398. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  399. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  400. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  401. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  402. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  403. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  404. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  405. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  406. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  407. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  412. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  413. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  414. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  415. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  416. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  417. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  418. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  419. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  420. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  421. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  422. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  423. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  424. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  425. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  426. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  427. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  428. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  429. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  430. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  431. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  432. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  433. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  434. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  435. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  436. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  437. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  438. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  439. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  440. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  441. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  442. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  443. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  444. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +880 -0
  445. data/ext/sources/ggml/src/ggml-openvino/utils.h +143 -0
  446. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  447. data/ext/sources/ggml/src/ggml-quants.c +385 -119
  448. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  449. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  450. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  451. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  452. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  453. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +64 -91
  454. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  455. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +4 -1
  456. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  457. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  458. data/ext/sources/ggml/src/ggml-sycl/common.hpp +356 -11
  459. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +184 -14
  460. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +31 -1
  461. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  462. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  463. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  464. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  465. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  466. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  467. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  468. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  469. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +77 -156
  470. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -2
  471. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  472. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  473. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  474. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  475. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  476. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  477. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +227 -0
  478. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  479. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  480. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  481. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +347 -0
  482. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  483. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  484. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  485. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +1134 -236
  486. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  487. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  488. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  489. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  490. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  491. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  492. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  493. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  494. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +72 -1
  495. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  496. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  497. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  498. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  499. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  500. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  501. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  502. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  503. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  504. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  505. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  506. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  507. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  508. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  509. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  510. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  511. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  512. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  513. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  514. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  515. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  516. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  517. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  518. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  519. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  520. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  521. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  522. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  523. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  524. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  525. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  526. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  527. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  528. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  529. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  530. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  531. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  532. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  533. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  534. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  535. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  536. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  537. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  538. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  539. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  540. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  541. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  542. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  543. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  544. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  545. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  546. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  547. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  548. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  549. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  550. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  551. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  552. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  553. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  554. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  555. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +228 -53
  556. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  557. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  558. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  559. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  560. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  561. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  562. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  563. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  564. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  565. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  566. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  567. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  568. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  569. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  570. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  571. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  572. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  573. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  574. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  575. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  576. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  577. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  578. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  579. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  580. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  581. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  582. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  583. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  584. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  585. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  586. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  587. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  588. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  589. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  590. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  591. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  592. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  593. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  594. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  595. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  596. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  597. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  598. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  599. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3250 -940
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +533 -180
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +113 -68
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +412 -222
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +222 -83
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +189 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +22 -20
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +51 -14
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +39 -63
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +13 -7
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +27 -11
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -149
  663. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  664. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3221 -97
  665. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3493 -1997
  666. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  667. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  668. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  669. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  670. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  671. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +142 -0
  672. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +115 -141
  673. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +93 -0
  674. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  675. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -44
  676. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  677. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +198 -230
  678. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  679. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  680. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  681. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  682. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  683. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  684. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +234 -335
  685. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  686. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  687. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +871 -42
  688. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  689. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  690. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  691. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  692. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +36 -138
  693. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +151 -0
  694. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  695. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  696. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  697. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  698. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  699. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  700. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  701. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  702. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  703. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +15 -40
  704. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  705. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +39 -12
  706. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  707. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  708. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  709. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  710. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  711. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  712. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +213 -0
  713. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  714. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +24 -15
  715. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  716. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +253 -16
  717. data/ext/sources/ggml/src/ggml.c +268 -52
  718. data/ext/sources/ggml/src/gguf.cpp +377 -47
  719. data/ext/sources/include/parakeet.h +342 -0
  720. data/ext/sources/include/whisper.h +10 -0
  721. data/ext/sources/media/matmul.png +0 -0
  722. data/ext/sources/src/CMakeLists.txt +23 -0
  723. data/ext/sources/src/parakeet-arch.h +188 -0
  724. data/ext/sources/src/parakeet.cpp +3838 -0
  725. data/ext/sources/src/whisper.cpp +62 -40
  726. data/extsources.rb +26 -10
  727. data/lib/whisper/log_settable.rb +36 -0
  728. data/lib/whisper/model/uri.rb +13 -1
  729. data/lib/whisper/output.rb +74 -0
  730. data/sig/whisper.rbs +445 -55
  731. data/test/helper.rb +2 -0
  732. data/test/jfk_reader/jfk_reader.c +50 -7
  733. data/test/test_callback.rb +1 -0
  734. data/test/test_context_params.rb +82 -0
  735. data/test/test_package.rb +6 -5
  736. data/test/test_parakeet.rb +28 -0
  737. data/test/test_parakeet_callback.rb +107 -0
  738. data/test/test_parakeet_context.rb +116 -0
  739. data/test/test_parakeet_context_params.rb +24 -0
  740. data/test/test_parakeet_model.rb +21 -0
  741. data/test/test_parakeet_params.rb +78 -0
  742. data/test/test_parakeet_segment.rb +42 -0
  743. data/test/test_parakeet_token.rb +73 -0
  744. data/test/test_params.rb +2 -0
  745. data/test/test_token.rb +11 -0
  746. data/test/test_vad_context.rb +58 -8
  747. data/test/test_vad_segment.rb +1 -1
  748. data/test/test_whisper.rb +44 -6
  749. data/whispercpp.gemspec +2 -2
  750. metadata +426 -280
  751. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  752. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  753. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  754. data/ext/sources/bindings/javascript/package.json +0 -26
  755. data/ext/sources/bindings/javascript/whisper.js +0 -19
  756. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  757. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  758. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  759. data/ext/sources/examples/addon.node/index.js +0 -59
  760. data/ext/sources/examples/addon.node/package.json +0 -16
  761. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  762. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  763. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  764. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  765. data/ext/sources/examples/coi-serviceworker.js +0 -146
  766. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  767. data/ext/sources/examples/command/command.cpp +0 -802
  768. data/ext/sources/examples/command/commands.txt +0 -9
  769. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  770. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  771. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  772. data/ext/sources/examples/generate-karaoke.sh +0 -57
  773. data/ext/sources/examples/helpers.js +0 -191
  774. data/ext/sources/examples/livestream.sh +0 -112
  775. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  776. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  777. data/ext/sources/examples/lsp/whisper.vim +0 -362
  778. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  779. data/ext/sources/examples/python/whisper_processor.py +0 -54
  780. data/ext/sources/examples/server/bench.js +0 -29
  781. data/ext/sources/examples/server.py +0 -120
  782. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  783. data/ext/sources/examples/stream/stream.cpp +0 -437
  784. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  785. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  786. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  787. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  788. data/ext/sources/examples/sycl/build.sh +0 -22
  789. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  790. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  791. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -47
  792. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  793. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -494
  794. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -88
  795. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2559
  796. data/ext/sources/examples/talk-llama/llama-arch.h +0 -586
  797. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -917
  798. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  799. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -876
  800. data/ext/sources/examples/talk-llama/llama-chat.h +0 -70
  801. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3645
  802. data/ext/sources/examples/talk-llama/llama-context.h +0 -360
  803. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  804. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -42
  805. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  806. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  807. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2282
  808. data/ext/sources/examples/talk-llama/llama-graph.h +0 -910
  809. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -241
  810. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -284
  811. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  812. data/ext/sources/examples/talk-llama/llama-impl.h +0 -63
  813. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  814. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  815. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -328
  816. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  817. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2100
  818. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -390
  819. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  820. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  821. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  822. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1167
  823. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  824. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  825. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  826. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -735
  827. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  828. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1247
  829. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -176
  830. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -285
  831. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -37
  832. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -8338
  833. data/ext/sources/examples/talk-llama/llama-model.h +0 -544
  834. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1072
  835. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  836. data/ext/sources/examples/talk-llama/llama-sampling.cpp +0 -3771
  837. data/ext/sources/examples/talk-llama/llama-sampling.h +0 -44
  838. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3900
  839. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -182
  840. data/ext/sources/examples/talk-llama/llama.cpp +0 -1140
  841. data/ext/sources/examples/talk-llama/llama.h +0 -1540
  842. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -191
  843. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  844. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  845. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -138
  846. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  847. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -122
  848. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -144
  849. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -135
  850. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -178
  851. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -160
  852. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  853. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  854. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  855. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  856. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  857. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  858. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  859. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -123
  860. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  861. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -144
  862. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -259
  863. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -134
  864. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  865. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -150
  866. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  867. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  868. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  869. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -113
  870. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  871. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  872. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  873. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  874. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  875. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  876. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  877. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -150
  878. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  879. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  880. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -196
  881. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -211
  882. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +0 -283
  883. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  884. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -141
  885. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  886. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -154
  887. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  888. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  889. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  890. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -175
  891. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  892. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  893. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  894. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -168
  895. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  896. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -55
  897. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -123
  898. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -199
  899. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -124
  900. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  901. data/ext/sources/examples/talk-llama/models/models.h +0 -569
  902. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -116
  903. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  904. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -150
  905. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  906. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  907. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  908. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  909. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  910. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  911. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  912. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  913. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  914. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  915. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  916. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  917. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -316
  918. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  919. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -168
  920. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  921. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  922. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  923. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  924. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -117
  925. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -124
  926. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -873
  927. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -149
  928. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -141
  929. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  930. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  931. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -162
  932. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  933. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  934. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -135
  935. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  936. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  937. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  938. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  939. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  940. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  941. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  942. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  943. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  944. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  945. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  946. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  947. data/ext/sources/examples/talk-llama/speak +0 -40
  948. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  949. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  950. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  951. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  952. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  953. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1147
  954. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  955. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  956. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  957. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  958. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  959. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  960. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  961. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  962. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  963. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  964. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  965. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  966. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  967. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  968. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  969. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  970. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +0 -157
  971. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -165
  972. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  973. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  974. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  975. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  976. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  977. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  978. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  979. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  980. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  981. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  982. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  983. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  984. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -147
  985. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  986. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +0 -907
  987. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +0 -247
  988. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  989. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  990. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  991. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
  992. data/ext/sources/tests/CMakeLists.txt +0 -112
  993. data/ext/sources/tests/earnings21/eval.mk +0 -58
  994. data/ext/sources/tests/earnings21/eval.py +0 -68
  995. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  996. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  997. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  998. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  999. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  1000. data/ext/sources/tests/en-0-ref.txt +0 -1
  1001. data/ext/sources/tests/en-1-ref.txt +0 -1
  1002. data/ext/sources/tests/en-2-ref.txt +0 -1
  1003. data/ext/sources/tests/es-0-ref.txt +0 -1
  1004. data/ext/sources/tests/librispeech/eval.mk +0 -39
  1005. data/ext/sources/tests/librispeech/eval.py +0 -47
  1006. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  1007. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  1008. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  1009. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  1010. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  1011. data/ext/sources/tests/run-tests.sh +0 -130
  1012. data/ext/sources/tests/test-c.c +0 -3
  1013. data/ext/sources/tests/test-vad-full.cpp +0 -56
  1014. data/ext/sources/tests/test-vad.cpp +0 -83
  1015. data/ext/sources/tests/test-whisper.js +0 -58
  1016. data/lib/whisper/context.rb +0 -15
  1017. data/lib/whisper/segment.rb +0 -58
@@ -1,3900 +0,0 @@
1
- #include "llama-vocab.h"
2
-
3
- #include "ggml.h"
4
- #include "gguf.h"
5
- #include "llama-impl.h"
6
- #include "llama-model-loader.h"
7
-
8
- #include "unicode.h"
9
-
10
- #include <algorithm>
11
- #include <cassert>
12
- #include <cctype>
13
- #include <cfloat>
14
- #include <cmath>
15
- #include <cstdarg>
16
- #include <cstring>
17
- #include <forward_list>
18
- #include <limits>
19
- #include <map>
20
- #include <queue>
21
- #include <set>
22
- #include <unordered_map>
23
-
24
- //
25
- // helpers
26
- //
27
-
28
- struct naive_trie {
29
- naive_trie() : has_value(false), value(0) {
30
- }
31
- void insert(const char * key, size_t len, int32_t value = 0) {
32
- if (len == 0) {
33
- this->has_value = true;
34
- this->value = value;
35
- return;
36
- }
37
- char c = key[0];
38
- auto res = children.find(c);
39
- if (res != children.end()) {
40
- res->second.insert(key + 1, len - 1, value);
41
- } else {
42
- auto res = children.insert(std::make_pair(c, naive_trie()));
43
- res.first->second.insert(key + 1, len - 1, value);
44
- }
45
- }
46
- std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
47
- if (len == 0 || offset == len) {
48
- return std::make_pair(key, offset);
49
- }
50
- char c = key[offset];
51
- auto res = children.find(c);
52
- if (res != children.end()) {
53
- return res->second.get_longest_prefix(key, len, offset + 1);
54
- }
55
-
56
- return std::make_pair(key, offset);
57
- }
58
- const struct naive_trie * traverse(const char c) const {
59
- auto res = children.find(c);
60
- if (res != children.end()) {
61
- return &res->second;
62
- }
63
-
64
- return NULL;
65
- }
66
- std::map<char, struct naive_trie> children;
67
- bool has_value;
68
- llama_token value;
69
- };
70
-
71
- //
72
- // tokenizers
73
- //
74
-
75
- struct llm_tokenizer {
76
- llm_tokenizer() {}
77
- virtual ~llm_tokenizer() = default;
78
- };
79
-
80
- struct llm_symbol {
81
- using index = int;
82
- index prev;
83
- index next;
84
- const char * text;
85
- size_t n;
86
- };
87
-
88
- static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
89
-
90
- //
91
- // SPM tokenizer
92
- // original implementation:
93
- // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
94
- //
95
-
96
- struct llm_bigram_spm {
97
- struct comparator {
98
- bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
99
- return (l.score < r.score) || (l.score == r.score && l.left > r.left);
100
- }
101
- };
102
- using queue_storage = std::vector<llm_bigram_spm>;
103
- using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
104
- llm_symbol::index left;
105
- llm_symbol::index right;
106
- float score;
107
- size_t size;
108
- };
109
-
110
- struct llm_tokenizer_spm : llm_tokenizer {
111
- llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
112
- };
113
-
114
- struct llm_tokenizer_spm_session {
115
- llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
116
-
117
- void tokenize(const std::string & text, std::vector<llama_token> & output) {
118
- // split string into utf8 chars
119
- int index = 0;
120
- size_t offs = 0;
121
- while (offs < text.size()) {
122
- llm_symbol sym;
123
- size_t len = unicode_len_utf8(text[offs]);
124
- sym.text = text.c_str() + offs;
125
- sym.n = std::min(len, text.size() - offs);
126
- offs += sym.n;
127
- sym.prev = index - 1;
128
- sym.next = offs == text.size() ? -1 : index + 1;
129
- index++;
130
- symbols.emplace_back(sym);
131
- }
132
-
133
- // seed the work queue with all possible 2-character tokens.
134
- for (int i = 1; i < (int) symbols.size(); ++i) {
135
- try_add_bigram(i - 1, i);
136
- }
137
-
138
- // keep substituting the highest frequency pairs for as long as we can.
139
- while (!work_queue.empty()) {
140
- auto bigram = work_queue.top();
141
- work_queue.pop();
142
-
143
- auto & left_sym = symbols[bigram.left];
144
- auto & right_sym = symbols[bigram.right];
145
-
146
- // if one of the symbols already got merged, skip it.
147
- if (left_sym.n == 0 || right_sym.n == 0 ||
148
- left_sym.n + right_sym.n != bigram.size) {
149
- continue;
150
- }
151
-
152
- // merge the right sym into the left one
153
- left_sym.n += right_sym.n;
154
- right_sym.n = 0;
155
-
156
- //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
157
-
158
- // remove the right sym from the chain
159
- left_sym.next = right_sym.next;
160
- if (right_sym.next >= 0) {
161
- symbols[right_sym.next].prev = bigram.left;
162
- }
163
-
164
- // find more substitutions
165
- try_add_bigram(left_sym.prev, bigram.left);
166
- try_add_bigram(bigram.left, left_sym.next);
167
- }
168
-
169
- for (int i = 0; i != -1; i = symbols[i].next) {
170
- auto & symbol = symbols[i];
171
- resegment(symbol, output);
172
- }
173
- }
174
-
175
- private:
176
- void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
177
- auto text = std::string(symbol.text, symbol.n);
178
- auto token = vocab.text_to_token(text);
179
-
180
- // Do we need to support is_unused?
181
- if (token != LLAMA_TOKEN_NULL) {
182
- output.push_back(token);
183
- return;
184
- }
185
-
186
- const auto p = rev_merge.find(text);
187
-
188
- if (p == rev_merge.end()) {
189
- // output any symbols that did not form tokens as bytes.
190
- output.reserve(output.size() + symbol.n);
191
- for (int j = 0; j < (int)symbol.n; ++j) {
192
- llama_token id = vocab.byte_to_token(symbol.text[j]);
193
- output.push_back(id);
194
- }
195
- return;
196
- }
197
-
198
- resegment(symbols[p->second.first], output);
199
- resegment(symbols[p->second.second], output);
200
- }
201
-
202
- void try_add_bigram(int left, int right) {
203
- if (left == -1 || right == -1) {
204
- return;
205
- }
206
- const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
207
- auto token = vocab.text_to_token(text);
208
-
209
- if (token == LLAMA_TOKEN_NULL) {
210
- return;
211
- }
212
-
213
- if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
214
- return;
215
- }
216
-
217
- const auto & tok_data = vocab.get_token_data(token);
218
-
219
- llm_bigram_spm bigram;
220
- bigram.left = left;
221
- bigram.right = right;
222
- bigram.score = tok_data.score;
223
- bigram.size = text.size();
224
-
225
- work_queue.push(bigram);
226
-
227
- // Do we need to support is_unused?
228
- rev_merge[text] = std::make_pair(left, right);
229
- }
230
-
231
- const llama_vocab & vocab;
232
- // currently unused
233
- // const llm_tokenizer_spm * spm_tokenizer;
234
-
235
- std::vector<llm_symbol> symbols;
236
- llm_bigram_spm::queue work_queue;
237
- std::map<std::string, std::pair<int, int>> rev_merge;
238
- };
239
-
240
- //
241
- // BPE tokenizer
242
- // adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
243
- // tried to simplify unicode stuff, so most likely does not work 100% correctly!
244
- //
245
-
246
- // TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
247
-
248
- template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
249
- class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
250
- public:
251
- using std::priority_queue<T, Container, Compare>::priority_queue;
252
-
253
- T pop_move() {
254
- T item = std::move(this->c.front());
255
- std::pop_heap(this->c.begin(), this->c.end(), this->comp);
256
- this->c.pop_back();
257
- return item;
258
- }
259
-
260
- void pop() = delete;
261
- };
262
-
263
- struct llm_bigram_bpe {
264
- struct comparator {
265
- bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
266
- return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
267
- }
268
- };
269
-
270
- using queue_storage = std::vector<llm_bigram_bpe>;
271
- using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
272
- llm_symbol::index left;
273
- llm_symbol::index right;
274
- std::string text;
275
- int rank;
276
- size_t size;
277
- };
278
-
279
- struct llm_tokenizer_bpe : llm_tokenizer {
280
- llm_tokenizer_bpe(const llama_vocab & vocab) {
281
- GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
282
- switch (vocab.get_pre_type()) {
283
- case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
284
- regex_exprs = {
285
- // original regex from tokenizer.json
286
- //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
287
-
288
- // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
289
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
290
- };
291
- break;
292
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
293
- case LLAMA_VOCAB_PRE_TYPE_SMAUG:
294
- regex_exprs = {
295
- // same as llama3
296
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
297
- };
298
- break;
299
- case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
300
- regex_exprs = {
301
- "[\r\n]",
302
- "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
303
- "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
304
- "\\s+$",
305
- "[一-龥ࠀ-一가-퟿]+",
306
- "\\p{N}+",
307
- };
308
- break;
309
- case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
310
- case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
311
- regex_exprs = {
312
- "\\p{N}{1,3}",
313
- "[一-龥぀-ゟ゠-ヿ]+",
314
- "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
315
- };
316
- break;
317
- case LLAMA_VOCAB_PRE_TYPE_YOUTU:
318
- regex_exprs = {
319
- "[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
320
- "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
321
- };
322
- break;
323
- case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
324
- regex_exprs = {
325
- "[\r\n]",
326
- "\\s?\\p{L}+",
327
- "\\s?\\p{P}+",
328
- "[一-龥ࠀ-一가-퟿]+",
329
- "\\p{N}",
330
- };
331
- break;
332
- case LLAMA_VOCAB_PRE_TYPE_FALCON:
333
- regex_exprs = {
334
- "[\\p{P}\\$\\+<=>\\^~\\|`]+",
335
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
336
- "[0-9][0-9][0-9]",
337
- };
338
- break;
339
- case LLAMA_VOCAB_PRE_TYPE_STARCODER:
340
- case LLAMA_VOCAB_PRE_TYPE_REFACT:
341
- case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
342
- case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
343
- case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
344
- case LLAMA_VOCAB_PRE_TYPE_EXAONE:
345
- case LLAMA_VOCAB_PRE_TYPE_MINERVA:
346
- regex_exprs = {
347
- "\\p{N}",
348
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
349
- };
350
- break;
351
- case LLAMA_VOCAB_PRE_TYPE_GPT2:
352
- case LLAMA_VOCAB_PRE_TYPE_MPT:
353
- case LLAMA_VOCAB_PRE_TYPE_OLMO:
354
- case LLAMA_VOCAB_PRE_TYPE_JAIS:
355
- case LLAMA_VOCAB_PRE_TYPE_TRILLION:
356
- case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
357
- regex_exprs = {
358
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
359
- };
360
- break;
361
- case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
362
- case LLAMA_VOCAB_PRE_TYPE_QWEN2:
363
- case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
364
- case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
365
- regex_exprs = {
366
- // original regex from tokenizer.json
367
- // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
368
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
369
- };
370
- break;
371
- case LLAMA_VOCAB_PRE_TYPE_PORO:
372
- case LLAMA_VOCAB_PRE_TYPE_BLOOM:
373
- case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
374
- regex_exprs = {
375
- " ?[^(\\s|.,!?…。,、।۔،)]+",
376
- };
377
- break;
378
- case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
379
- regex_exprs = {
380
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
381
- };
382
- break;
383
- case LLAMA_VOCAB_PRE_TYPE_VIKING:
384
- regex_exprs = {
385
- " ?[^(\\s|.,!?…。,、।۔،)]+",
386
- "\\p{N}",
387
- };
388
- break;
389
- case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
390
- // original regex from tokenizer.json
391
- // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
392
- regex_exprs = {
393
- "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
394
- };
395
- break;
396
- case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
397
- // Note: in theory, the special token (sentinel and image token) regex_exprs below
398
- // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
399
- // However, since the upstream pre-tokenizer uses them, they are also
400
- // included here (see https://huggingface.co/facebook/chameleon-7b).
401
- regex_exprs = {
402
- "<sentinel:[0-9]+>", // Sentinel tokens
403
- "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens
404
- "([\\t\\n]| | )", // directly from tokenizer.json
405
- "\\p{N}", // Individual digits
406
- "[\\p{P}!-/:-@\\[-`{-~]", // Punctuation, Isolated
407
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
408
- };
409
- break;
410
- case LLAMA_VOCAB_PRE_TYPE_GPT4O:
411
- case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2:
412
- regex_exprs = {
413
- // original regex from tokenizer.json
414
- // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
415
- "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
416
- };
417
- break;
418
- case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
419
- regex_exprs = {
420
- // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
421
- // The custom handler implements all K2 patterns with proper Han character exclusion
422
- "\\p{Han}+",
423
- };
424
- break;
425
- case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
426
- regex_exprs = {
427
- "\\p{N}+",
428
- "(?=(\\d{3})+(?!\\d))",
429
- };
430
- break;
431
- case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
432
- regex_exprs = {
433
- // original regex from tokenizer.json
434
- // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
435
- // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
436
- "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
437
- };
438
- break;
439
- case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
440
- regex_exprs = {
441
- // original regex from tokenizer.json
442
- // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
443
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
444
- };
445
- break;
446
- case LLAMA_VOCAB_PRE_TYPE_GROK_2:
447
- regex_exprs = {
448
- // original regex from tokenizer.json
449
- // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
450
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
451
- };
452
- break;
453
- case LLAMA_VOCAB_PRE_TYPE_AFMOE:
454
- regex_exprs = {
455
- // Digit handling - uses custom implementation in unicode.cpp
456
- // Groups digits with leading 1-2 based on total length modulo 3
457
- "\\p{AFMoE_digits}",
458
- // CJK and Asian scripts (using direct Unicode literals)
459
- "[一-鿿㐀-䶿豈-﫿぀-ゟ゠-ヿ・-゚⼀-⿟เ-๿຀-໿ក-៿က-႟ꩠ-ꩿꧠ-꧿가-힯ᄀ-ᇿ]+",
460
- // Main BPE pattern
461
- "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
462
- };
463
- break;
464
- default:
465
- // default regex for BPE tokenization pre-processing
466
- regex_exprs = {
467
- "[\\p{P}\\$\\+<=>\\^~\\|]+",
468
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
469
- "\\p{N}+",
470
- "[0-9][0-9][0-9]",
471
- };
472
- break;
473
- }
474
- }
475
-
476
- std::vector<std::string> regex_exprs;
477
- };
478
-
479
- struct llm_tokenizer_bpe_session {
480
- llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
481
-
482
- static void append(const llama_token token_id, std::vector<llama_token> & output) {
483
- output.push_back(token_id);
484
- }
485
-
486
- bool append_bos(std::vector<llama_token> & output) const {
487
- if (vocab.get_add_bos()) {
488
- GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
489
- output.push_back(vocab.token_bos());
490
- return true;
491
- }
492
- return false;
493
- }
494
-
495
- bool append_eos(std::vector<llama_token> & output) const {
496
- if (vocab.get_add_eos()) {
497
- GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
498
- output.push_back(vocab.token_eos());
499
- return true;
500
- }
501
- return false;
502
- }
503
-
504
- void check_double_bos_eos(const std::vector<llama_token> & output) const {
505
- if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
506
- LLAMA_LOG_WARN(
507
- "%s: Added a BOS token to the prompt as specified by the model but the prompt "
508
- "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
509
- "Are you sure this is what you want?\n", __FUNCTION__);
510
- }
511
- if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
512
- LLAMA_LOG_WARN(
513
- "%s: Added a EOS token to the prompt as specified by the model but the prompt "
514
- "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
515
- "Are you sure this is what you want?\n", __FUNCTION__);
516
- }
517
- }
518
-
519
- void tokenize(const std::string & text, std::vector<llama_token> & output) {
520
- int final_prev_index = -1;
521
- const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
522
-
523
- symbols_final.clear();
524
-
525
- for (const auto & word : word_collection) {
526
- work_queue = llm_bigram_bpe::queue();
527
- symbols.clear();
528
-
529
- int index = 0;
530
- size_t offset = 0;
531
-
532
- //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
533
- if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
534
- symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
535
- offset = word.size();
536
- }
537
-
538
- while (offset < word.size()) {
539
- llm_symbol sym;
540
- size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset]));
541
- sym.text = word.c_str() + offset;
542
- sym.n = char_len;
543
- offset += sym.n;
544
- sym.prev = index - 1;
545
- sym.next = offset == word.size() ? -1 : index + 1;
546
- index++;
547
- symbols.emplace_back(sym);
548
- }
549
- for (int i = 1; i < (int) symbols.size(); ++i) {
550
- add_new_bigram(i - 1, i);
551
- }
552
-
553
- // build token(s)
554
- while (!work_queue.empty()) {
555
- auto bigram = work_queue.pop_move();
556
-
557
- auto & left_symbol = symbols[bigram.left];
558
- auto & right_symbol = symbols[bigram.right];
559
-
560
- if (left_symbol.n == 0 || right_symbol.n == 0) {
561
- continue;
562
- }
563
- std::string left_token = std::string(left_symbol.text, left_symbol.n);
564
- std::string right_token = std::string(right_symbol.text, right_symbol.n);
565
- if (left_token + right_token != bigram.text) {
566
- continue; // Skip this bigram if it's outdated
567
- }
568
-
569
- // merge the right sym into the left one
570
- left_symbol.n += right_symbol.n;
571
- right_symbol.n = 0;
572
-
573
- // remove the right sym from the chain
574
- left_symbol.next = right_symbol.next;
575
- if (right_symbol.next >= 0) {
576
- symbols[right_symbol.next].prev = bigram.left;
577
- }
578
-
579
- add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol
580
- add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
581
- }
582
-
583
- // add the finished tokens to the final list keeping correct order for next and prev
584
- for (auto & sym : symbols) {
585
- if (sym.n > 0) {
586
- sym.prev = final_prev_index;
587
- sym.next = -1;
588
- if (final_prev_index != -1) {
589
- symbols_final[final_prev_index].next = symbols_final.size();
590
- }
591
- symbols_final.emplace_back(sym);
592
- final_prev_index = symbols_final.size() - 1;
593
- }
594
- }
595
- }
596
-
597
- symbols = symbols_final;
598
-
599
- if (!symbols.empty()) {
600
- for (int i = 0; i != -1; i = symbols[i].next) {
601
- auto & symbol = symbols[i];
602
- if (symbol.n == 0) {
603
- continue;
604
- }
605
-
606
- const std::string str = std::string(symbol.text, symbol.n);
607
- const auto token = vocab.text_to_token(str);
608
-
609
- if (token == LLAMA_TOKEN_NULL) {
610
- for (auto j = str.begin(); j != str.end(); ++j) {
611
- std::string byte_str(1, *j);
612
- auto token_multibyte = vocab.text_to_token(byte_str);
613
- if (token_multibyte != LLAMA_TOKEN_NULL) {
614
- output.push_back(token_multibyte);
615
- }
616
- }
617
- } else {
618
- output.push_back(token);
619
- }
620
- }
621
- }
622
- }
623
-
624
- private:
625
- void add_new_bigram(int left, int right) {
626
- if (left == -1 || right == -1) {
627
- return;
628
- }
629
- std::string left_token = std::string(symbols[left].text, symbols[left].n);
630
- std::string right_token = std::string(symbols[right].text, symbols[right].n);
631
-
632
- int rank_found = -1;
633
-
634
- rank_found = vocab.find_bpe_rank(left_token, right_token);
635
-
636
- if (rank_found < 0) {
637
- return;
638
- }
639
-
640
- llm_bigram_bpe bigram;
641
-
642
- bigram.left = left;
643
- bigram.right = right;
644
- bigram.text = left_token + right_token;
645
- bigram.size = left_token.size() + right_token.size();
646
- bigram.rank = rank_found;
647
-
648
- work_queue.push(bigram);
649
- }
650
-
651
- const llama_vocab & vocab;
652
- const llm_tokenizer_bpe & tokenizer;
653
-
654
- std::vector<llm_symbol> symbols;
655
- std::vector<llm_symbol> symbols_final;
656
- llm_bigram_bpe::queue work_queue;
657
- };
658
-
659
- //
660
- // WPM tokenizer
661
- //
662
-
663
- struct llm_tokenizer_wpm : llm_tokenizer {
664
- llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
665
- };
666
-
667
- struct llm_tokenizer_wpm_session {
668
- llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
669
-
670
- void tokenize(const std::string & text, std::vector<llama_token> & output) {
671
- // normalize and split by whitespace
672
- std::vector<std::string> words = preprocess(text);
673
- // bos token prepended already
674
-
675
- // find the longest tokens that form the words
676
- for (const std::string & word : words) {
677
- // skip empty words
678
- if (word.size() == 0) {
679
- continue;
680
- }
681
-
682
- // prepend phantom space
683
- const std::string word1 = "\xe2\x96\x81" + word;
684
- const int n = word1.size();
685
-
686
- const size_t current_tokens = output.size();
687
-
688
- // we're at the start of a new word
689
- // move through character position in word
690
- for (int i = 0; i < n; ++i) {
691
- // loop through possible match length
692
- bool match = false;
693
- for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
694
- auto id = vocab.text_to_token(word1.substr(i, j - i));
695
- if (id != LLAMA_TOKEN_NULL) {
696
- output.push_back(id);
697
- match = true;
698
- i = j - 1;
699
- break;
700
- }
701
- }
702
-
703
- if (!match) { // discard all
704
- output.resize(current_tokens);
705
- break; // and discard next tokens
706
- }
707
- }
708
-
709
- // we didn't find any matches for this word
710
- if (current_tokens == output.size()) {
711
- output.push_back(vocab.token_unk());
712
- }
713
- }
714
- }
715
-
716
- // TODO: reduce string copies by using cpts_offs array
717
- static std::vector<std::string> preprocess(const std::string & text) {
718
- const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
719
- std::vector<std::string> words(1, "");
720
-
721
- for (const uint32_t cpt : cpts_nfd) {
722
- const auto flags = unicode_cpt_flags_from_cpt(cpt);
723
-
724
- if (flags.is_whitespace) {
725
- if (words.back().size()) { // finish previous word if any
726
- words.emplace_back();
727
- }
728
- continue;
729
- }
730
-
731
- assert (!flags.is_separator);
732
- if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
733
- continue;
734
- }
735
-
736
- const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
737
- if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
738
- if (words.back().size()) { // finish previous word if any
739
- words.emplace_back();
740
- }
741
- words.back() = s; // single char word
742
- words.emplace_back(); // start a new word
743
- } else {
744
- words.back() += s; // append char to word
745
- }
746
- }
747
-
748
- if (!words.back().size()) {
749
- words.pop_back();
750
- }
751
-
752
- return words;
753
- }
754
-
755
- static bool is_chinese_char(uint32_t cpt) {
756
- return
757
- (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
758
- (cpt >= 0x03400 && cpt <= 0x04DBF) ||
759
- (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
760
- (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
761
- (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
762
- (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
763
- (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
764
- (cpt >= 0x2F800 && cpt <= 0x2FA1F);
765
- //(cpt >= 0x3000 && cpt <= 0x303F) ||
766
- //(cpt >= 0xFF00 && cpt <= 0xFFEF);
767
- }
768
-
769
- private:
770
- const llama_vocab & vocab;
771
- // currently unused
772
- // const llm_tokenizer_wpm * wpm_tokenizer;
773
- };
774
-
775
- //
776
- // UGM tokenizer
777
- //
778
-
779
- struct llm_tokenizer_ugm : llm_tokenizer {
780
- llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
781
- if (precompiled_charsmap.size() > 0) {
782
- size_t charsmap_offset = 0;
783
-
784
- // First four bytes of precompiled_charsmap contains length of binary
785
- // blob containing XOR-compressed compact double array (XCDA) entries
786
- uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
787
- charsmap_offset += sizeof(xcda_blob_size);
788
- if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
789
- throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
790
- }
791
-
792
- // Next xcda_blob_size bytes contain entries of XOR-compressed compact
793
- // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
794
- xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
795
- xcda_array_size = xcda_blob_size / sizeof(uint32_t);
796
- charsmap_offset += xcda_blob_size;
797
-
798
- // Remaining bytes of precompiled charsmap contain null-terminated
799
- // replacement strings for prefixes matched by the XCDA.
800
- prefix_replacements = &precompiled_charsmap[charsmap_offset];
801
- prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
802
- }
803
-
804
- for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
805
- const auto & token_data = vocab.get_token_data(id);
806
-
807
- if (vocab.is_normal(id)) {
808
- min_score = std::min<float>(min_score, token_data.score);
809
- max_score = std::max<float>(max_score, token_data.score);
810
- }
811
-
812
- if (vocab.is_normal(id) ||
813
- vocab.is_user_defined(id) ||
814
- vocab.is_unused(id)) {
815
- token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
816
- }
817
-
818
- if (vocab.is_user_defined(id)) {
819
- user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
820
- }
821
- }
822
-
823
- unknown_token_score = min_score - unknown_token_score_penalty;
824
- }
825
-
826
- // escaped space symbol - U+2581 (Lower One Eighth Block)
827
- const std::string escaped_space = "\xE2\x96\x81";
828
-
829
- const char * prefix_replacements = NULL;
830
- size_t prefix_replacements_size = 0;
831
-
832
- const uint32_t * xcda_array = NULL;
833
- size_t xcda_array_size = 0;
834
-
835
- struct naive_trie user_defined_token_matcher;
836
-
837
- float min_score = FLT_MAX;
838
- float max_score = -FLT_MAX;
839
-
840
- float unknown_token_score_penalty = 10.0;
841
- float unknown_token_score;
842
-
843
- struct naive_trie token_matcher;
844
- };
845
-
846
- struct llm_tokenizer_ugm_session {
847
- llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
848
-
849
- /* This implementation is based on SentencePiece optimized Viterbi algorithm for
850
- * unigram language models. The general idea is to:
851
- * - move along the input sequence in steps of one UTF code point,
852
- * - at each step find all possible tokenizations of the prefix by
853
- * traversing the tokens trie,
854
- * - for each tokenization store the best one so far (by higher score)
855
- * - use the position in sequence after given token as an index to store
856
- * results
857
- * - if there was no valid tokenization of the current UTF code point
858
- * then use unknown token with additional score penalty
859
- * After processing the whole sequence we backtrack from the end to get
860
- * the best tokenization.
861
- */
862
- void tokenize(const std::string & text, std::vector<llama_token> & output) {
863
- // get current size of output (for reversal later)
864
- size_t output_size = output.size();
865
-
866
- // normalize the input first
867
- std::string normalized;
868
- normalize(text, &normalized);
869
- size_t input_len = normalized.size();
870
- if (input_len == 0) {
871
- return;
872
- }
873
-
874
- // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
875
- std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
876
- // at the beginning tokenization score is zero
877
- tokenization_results[0] = { vocab.token_unk(), 0, 0 };
878
-
879
- for (size_t input_offset = 0; input_offset < input_len;) {
880
- size_t prefix_offset = input_offset;
881
- // calculate how many code units are in the currently processed UTF code point
882
- size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset);
883
-
884
- // traverse the token matcher trie to find a matching token
885
- bool single_codepoint_token_found = false;
886
- const struct best_tokenization & current_best = tokenization_results[input_offset];
887
- const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);
888
-
889
- while (prefix_offset <= input_len && node != NULL) {
890
- // check if we found valid token in prefix
891
- if (node->has_value) {
892
- // check if it corresponds to the whole UTF code point
893
- if (prefix_offset - input_offset == n_utf8_code_units) {
894
- single_codepoint_token_found = true;
895
- }
896
- llama_token token_id = node->value;
897
- const auto & token_data = vocab.get_token_data(token_id);
898
-
899
- // we set the user-defined token scores to 0 to make them more likely to be selected
900
- // (normal token scores are log probabilities, so they are negative)
901
- // score type is double here to make tokenization results exactly
902
- // the same as in the HF tokenizer using SentencePiece
903
- const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
904
- const double challenger_score = current_best.score_sum + token_score;
905
- struct best_tokenization & current_champ = tokenization_results[prefix_offset];
906
- if (challenger_score > current_champ.score_sum) {
907
- struct best_tokenization challenger = { token_id, input_offset, challenger_score };
908
- current_champ = challenger;
909
- }
910
- }
911
- node = node->traverse(normalized[prefix_offset++]);
912
- }
913
-
914
- // if we didn't find a valid token corresponding to the whole UTF code point
915
- // then use unknown token as the tokenization of this UTF code point
916
- if (!single_codepoint_token_found) {
917
- const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
918
- prefix_offset = input_offset + n_utf8_code_units;
919
- struct best_tokenization & current_champ = tokenization_results[prefix_offset];
920
- if (challenger_score > current_champ.score_sum) {
921
- struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
922
- current_champ = challenger;
923
- }
924
- }
925
-
926
- // move to the next UTF code point
927
- input_offset += n_utf8_code_units;
928
- }
929
-
930
- // now backtrack from the end to gather token ids of the best tokenization
931
- // merge sequences of consecutive unknown tokens into single unknown tokens
932
- bool is_prev_unknown = false;
933
- for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
934
- bool is_unknown = tokenization.token_id == vocab.token_unk();
935
- if (!(is_prev_unknown && is_unknown)) {
936
- output.push_back(tokenization.token_id);
937
- }
938
- if (tokenization.input_offset == 0) {
939
- break;
940
- }
941
- is_prev_unknown = is_unknown;
942
- }
943
-
944
- // reverse the output since we added tokens starting from the end of the input
945
- std::reverse(output.begin() + output_size, output.end());
946
- }
947
-
948
- private:
949
-
950
- // helper structure for returning normalization results
951
- struct normalization_result {
952
- const char * normalized;
953
- size_t normalized_len;
954
- size_t consumed_input;
955
- };
956
-
957
- void normalize(const std::string& input, std::string * normalized) {
958
- normalized->clear();
959
- normalized->reserve(input.size() * 3);
960
-
961
- const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";
962
-
963
- const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
964
- const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
965
- const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();
966
-
967
- bool is_space_prepended = false;
968
- bool processing_non_ws = false;
969
-
970
- size_t input_len = input.size();
971
-
972
- for (size_t input_offset = 0; input_offset < input_len; ) {
973
- auto norm_res = normalize_prefix(input, input_offset);
974
- for (size_t i = 0; i < norm_res.normalized_len; i++) {
975
- char c = norm_res.normalized[i];
976
- if (c != ' ') {
977
- if (!processing_non_ws) {
978
- processing_non_ws = true;
979
- if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
980
- normalized->append(space);
981
- is_space_prepended = true;
982
- }
983
- }
984
- normalized->push_back(c);
985
- } else {
986
- if (processing_non_ws) {
987
- processing_non_ws = false;
988
- }
989
- if (!shall_merge_spaces) {
990
- normalized->append(space);
991
- }
992
- }
993
- }
994
-
995
- input_offset += norm_res.consumed_input;
996
- }
997
-
998
- if (shall_append_space) {
999
- normalized->append(space);
1000
- }
1001
- }
1002
-
1003
- /*
1004
- * This structure is a view wrapper for XOR-compressed double array (XCDA)
1005
- * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
1006
- * Each bit-packed entry contains:
1007
- * - BASE array value in bits 10-30
1008
- * - LCHECK array value in bits 0-7
1009
- * - LEAF array value in bit 9
1010
- * Entries containing indexes of replacement sequences have set bit 31
1011
- */
1012
- struct xcda_array_view {
1013
- public:
1014
- xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
1015
- }
1016
- uint32_t get_base(size_t index) {
1017
- uint32_t packed_node = get_node(index);
1018
- return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
1019
- }
1020
- uint32_t get_lcheck(size_t index) {
1021
- uint32_t packed_node = get_node(index);
1022
- return packed_node & ((1U << 31) | 0xff);
1023
- }
1024
- bool get_leaf(size_t index) {
1025
- uint32_t packed_node = get_node(index);
1026
- return (packed_node >> 8) & 1;
1027
- }
1028
- uint32_t get_value(size_t index) {
1029
- uint32_t packed_node = get_node(index);
1030
- return packed_node & ((1U << 31) - 1);
1031
- }
1032
- private:
1033
- uint32_t get_node(size_t index) {
1034
- if (index >= xcda_array_size) {
1035
- throw std::runtime_error("Index out of array bounds in XCDA array!");
1036
- }
1037
- return xcda_array[index];
1038
- }
1039
- const uint32_t * xcda_array;
1040
- size_t xcda_array_size;
1041
- };
1042
-
1043
- // this structure stores the best tokenization so far at input_offset
1044
- struct best_tokenization {
1045
- llama_token token_id;
1046
- size_t input_offset;
1047
- double score_sum;
1048
- };
1049
-
1050
- struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
1051
- if (input_offset == input.size()) {
1052
- return { &input[input_offset], 0, 0 };
1053
- }
1054
-
1055
- // if input prefix matches some user-defined token return this token as normalization result
1056
- auto user_defined_token_match =
1057
- tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
1058
- if (user_defined_token_match.second > 0) {
1059
- return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
1060
- }
1061
-
1062
- size_t longest_prefix_length = 0;
1063
- size_t longest_prefix_offset = 0;
1064
-
1065
- if (tokenizer.xcda_array_size > 0) {
1066
- struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);
1067
-
1068
- // Find the longest normalized sequence matching the input prefix by walking
1069
- // the XOR-compressed compact double array (XCDA) starting from the root node
1070
- // We find the index of the next node by calculating BASE[s] ^ c where s is
1071
- // the index of the previous node and c is a numerical character value
1072
- uint32_t node_index = 0;
1073
- // get BASE of the root node
1074
- node_index = xcda_view.get_base(node_index);
1075
- for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
1076
- unsigned char c = input[prefix_offset];
1077
- if (c == 0) {
1078
- break;
1079
- }
1080
- node_index ^= c;
1081
- // if value of LCHECK is not c it means that this is not a child of
1082
- // the previous node, so we stop matching
1083
- if (xcda_view.get_lcheck(node_index) != c) {
1084
- break;
1085
- }
1086
- bool is_leaf = xcda_view.get_leaf(node_index);
1087
- // get BASE of the current node
1088
- node_index ^= xcda_view.get_base(node_index);
1089
- // if LEAF of the current node is true, it means that its BASE points to the node
1090
- // containing index of replacement sequence for currently matched input prefix
1091
- if (is_leaf)
1092
- {
1093
- longest_prefix_length = prefix_offset - input_offset + 1;
1094
- // get index of replacement sequence for currently matched input prefix
1095
- longest_prefix_offset = xcda_view.get_value(node_index);
1096
- }
1097
- }
1098
- }
1099
-
1100
- if (longest_prefix_length > 0) {
1101
- // we have a match, so return the replacement sequence
1102
- if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
1103
- throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
1104
- }
1105
- const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
1106
- return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
1107
- }
1108
-
1109
- // check if the input prefix contains a valid sequence of UTF-8 code units
1110
- try {
1111
- // if yes, return this sequence unmodified
1112
- size_t prefix_offset = input_offset;
1113
- unicode_cpt_from_utf8(input, prefix_offset);
1114
- return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
1115
- } catch (std::invalid_argument & /*ex*/) {
1116
- // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
1117
- return { "\xEF\xBF\xBD", 3, 1 };
1118
- }
1119
- }
1120
-
1121
- const llama_vocab & vocab;
1122
- const llm_tokenizer_ugm & tokenizer;
1123
- };
1124
-
1125
- //
1126
- // RWKV tokenizer
1127
- //
1128
-
1129
- static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
1130
- std::vector<uint8_t> output;
1131
- output.reserve(escaped.size());
1132
-
1133
- // Parser state
1134
- bool escaping = false;
1135
- uint8_t hex_remaining = 0;
1136
- uint8_t hex_acc = 0;
1137
-
1138
- // Step through characters, performing parsing
1139
- for (const char & c : escaped) {
1140
- // If we're parsing a hex code, interpret the next character
1141
- if (hex_remaining != 0) {
1142
- uint8_t value = (c >= 'a') ? (c - 'a' + 10) : (c - '0');
1143
- hex_acc = (hex_acc << 4) + value;
1144
-
1145
- hex_remaining -= 1;
1146
- if (hex_remaining == 0) {
1147
- output.push_back(hex_acc);
1148
- hex_acc = 0;
1149
- }
1150
-
1151
- continue;
1152
- }
1153
-
1154
- // If we got an escape character, interpret it
1155
- if (escaping) {
1156
- if (c == 't') {
1157
- output.push_back('\t');
1158
- } else if (c == 'n') {
1159
- output.push_back('\n');
1160
- } else if (c == 'r') {
1161
- output.push_back('\r');
1162
- } else if (c == 'x') {
1163
- hex_remaining = 2;
1164
- } else {
1165
- output.push_back(c);
1166
- }
1167
-
1168
- escaping = false;
1169
- continue;
1170
- }
1171
-
1172
- if (c == '\\') {
1173
- escaping = true;
1174
- continue;
1175
- }
1176
-
1177
- output.push_back(c);
1178
- }
1179
-
1180
- return output;
1181
- }
1182
-
1183
- struct llm_tokenizer_rwkv : llm_tokenizer {
1184
- llm_tokenizer_rwkv(const llama_vocab & vocab) {
1185
- // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
1186
- // For now, we decode the vocab here into the lookup we'll use for tokenization.
1187
-
1188
- // build trie
1189
- for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
1190
- const auto & data = vocab.get_token_data(id);
1191
- const auto text = llama_unescape_rwkv_token(data.text);
1192
- token_matcher.insert((const char *) text.data(), text.size(), id);
1193
- }
1194
- }
1195
-
1196
- struct naive_trie token_matcher;
1197
- };
1198
-
1199
- struct llm_tokenizer_rwkv_session {
1200
- llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
1201
-
1202
- void tokenize(const std::string & text, std::vector<llama_token> & output) {
1203
- uint32_t position = 0;
1204
- while (position < text.size()) {
1205
- const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
1206
- if (node == NULL) {
1207
- // no matching token found, add unknown token
1208
- output.push_back(vocab.token_unk());
1209
- position += 1;
1210
- continue;
1211
- }
1212
-
1213
- // traverse the trie to find the longest matching token
1214
- uint32_t token_id = 0;
1215
- uint32_t token_length = 0;
1216
- while (node != NULL) {
1217
- if (node->has_value) {
1218
- token_id = node->value;
1219
- token_length = position + 1;
1220
- }
1221
- node = node->traverse(text[++position]);
1222
- }
1223
-
1224
- // add the longest matching token
1225
- output.push_back(token_id);
1226
- position = token_length;
1227
- }
1228
- }
1229
-
1230
- private:
1231
- const llama_vocab & vocab;
1232
- const llm_tokenizer_rwkv & tokenizer;
1233
- };
1234
-
1235
- struct llm_tokenizer_plamo2 : llm_tokenizer {
1236
- llm_tokenizer_plamo2(const llama_vocab & vocab) {
1237
- build(vocab);
1238
- }
1239
-
1240
- void build(const llama_vocab & vocab) {
1241
- // Reset internal structures
1242
- tokens_.clear();
1243
- bytes_.assign(256, 0);
1244
- to_suffix_id_.clear();
1245
- table_.clear();
1246
-
1247
- // Build token list and byte mapping
1248
- std::unordered_map<std::string, float> suffix_to_score;
1249
- std::unordered_map<std::string, llama_token> token_to_id;
1250
-
1251
- for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
1252
- const auto & entry = vocab.get_token_data(token_id);
1253
- tokens_.push_back(entry.text);
1254
- token_to_id[entry.text] = static_cast<llama_token>(token_id);
1255
-
1256
- // Handle byte tokens
1257
- if (vocab.is_byte(token_id)) {
1258
- if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
1259
- std::string hex_str = entry.text.substr(3, 2);
1260
- int byte_val = std::stoi(hex_str, nullptr, 16);
1261
- bytes_[byte_val] = static_cast<llama_token>(token_id);
1262
- }
1263
- continue;
1264
- }
1265
-
1266
- // Add token and all its suffixes to suffix_to_score
1267
- suffix_to_score[entry.text] = entry.score;
1268
-
1269
- // Extract suffixes character by character (UTF-8 aware)
1270
- std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
1271
- for (size_t i = 1; i < cpts.size(); ++i) {
1272
- std::string suffix;
1273
- for (size_t j = i; j < cpts.size(); ++j) {
1274
- suffix += unicode_cpt_to_utf8(cpts[j]);
1275
- }
1276
- if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
1277
- suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
1278
- }
1279
- }
1280
- }
1281
-
1282
- // Check that all byte tokens are set
1283
- for (int i = 0; i < 256; ++i) {
1284
- if (bytes_[i] == 0) {
1285
- throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
1286
- }
1287
- }
1288
-
1289
- // Build suffix list in lexicographical order of reversed strings
1290
- std::vector<std::string> suffixes;
1291
- suffixes.reserve(suffix_to_score.size() + 1);
1292
- for (const auto & pair : suffix_to_score) {
1293
- suffixes.push_back(pair.first);
1294
- }
1295
- suffixes.push_back(""); // Empty suffix
1296
-
1297
- std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
1298
- std::string rev_a(a.rbegin(), a.rend());
1299
- std::string rev_b(b.rbegin(), b.rend());
1300
- return rev_a < rev_b;
1301
- });
1302
-
1303
- // Build suffix_to_id and to_suffix_id_
1304
- std::unordered_map<std::string, int32_t> suffix_to_id;
1305
- int32_t num_pieces = 0;
1306
-
1307
- for (const auto & suffix : suffixes) {
1308
- suffix_to_id[suffix] = num_pieces;
1309
- if (!suffix.empty()) {
1310
- std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
1311
-
1312
- std::string remaining;
1313
- for (size_t i = 1; i < cpts.size(); ++i) {
1314
- remaining += unicode_cpt_to_utf8(cpts[i]);
1315
- }
1316
-
1317
- int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
1318
- to_suffix_id_[piece_code] = num_pieces;
1319
-
1320
- // Count number of pieces for this suffix
1321
- int32_t pieces_for_suffix = 1; // sentinel row
1322
- for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
1323
- std::string piece;
1324
- for (int32_t i = 0; i < piece_length; ++i) {
1325
- piece += unicode_cpt_to_utf8(cpts[i]);
1326
- }
1327
- if (suffix_to_score.find(piece) != suffix_to_score.end()) {
1328
- pieces_for_suffix++;
1329
- }
1330
- }
1331
- num_pieces += pieces_for_suffix;
1332
- } else {
1333
- num_pieces++; // Empty suffix contributes one piece (sentinel row)
1334
- }
1335
- }
1336
-
1337
- // Build flattened table
1338
- table_.resize(num_pieces, std::vector<int32_t>(4, 0));
1339
- int32_t table_idx = 0;
1340
-
1341
- for (const auto & suffix : suffixes) {
1342
- // Add all prefixes of the suffix to the table (in decreasing order of length)
1343
- std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
1344
- for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
1345
- std::string piece;
1346
- for (int32_t i = 0; i < piece_length; ++i) {
1347
- piece += unicode_cpt_to_utf8(cpts[i]);
1348
- }
1349
-
1350
- auto score_it = suffix_to_score.find(piece);
1351
- if (score_it == suffix_to_score.end()) {
1352
- continue;
1353
- }
1354
-
1355
- table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
1356
- auto token_it = token_to_id.find(piece);
1357
- table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;
1358
-
1359
- float score = score_it->second;
1360
- table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
1361
- static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
1362
- table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];
1363
-
1364
- table_idx++;
1365
- }
1366
-
1367
- // Add sentinel row
1368
- table_[table_idx][TABLE_PIECE_LENGTH] = 1;
1369
- table_[table_idx][TABLE_TOKEN_ID] = -1;
1370
- table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
1371
- table_idx++;
1372
- }
1373
- }
1374
-
1375
- std::vector<llama_token> encode(const std::string & text) const {
1376
- std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
1377
- // Skip the first code point if it is a BOM (Byte Order Mark)
1378
- if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
1379
- unicode_data.erase(unicode_data.begin());
1380
- }
1381
-
1382
- if (unicode_data.empty()) {
1383
- return {};
1384
- }
1385
-
1386
- const size_t data_len = unicode_data.size();
1387
-
1388
- // Initialize scores array (dynamic programming)
1389
- std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
1390
- scores[data_len] = 0;
1391
-
1392
- // Path array to track best tokenization
1393
- std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));
1394
-
1395
- int32_t suffix_id = 0;
1396
-
1397
- // Process from end to beginning
1398
- for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
1399
- uint32_t c = unicode_data[i];
1400
-
1401
- // Find next suffix ID
1402
- for (size_t p = suffix_id; p < table_.size(); ++p) {
1403
- int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
1404
- auto it = to_suffix_id_.find(piece_code);
1405
- suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;
1406
-
1407
- if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
1408
- break;
1409
- }
1410
- }
1411
-
1412
- // Update best path
1413
- for (size_t p = suffix_id; p < table_.size(); ++p) {
1414
- int32_t score = table_[p][TABLE_SCORE];
1415
- if (score > INVALID_SCORE) {
1416
- int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
1417
- int64_t s = scores[i + piece_length] - score;
1418
-
1419
- if (s < scores[i]) {
1420
- scores[i] = s;
1421
- path[i][PATH_TOKEN_LENGTH] = piece_length;
1422
- path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
1423
- path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;
1424
-
1425
- if (score == UNKNOWN_SCORE) {
1426
- // Add UTF-8 byte count
1427
- path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
1428
- }
1429
- }
1430
- }
1431
-
1432
- if (score == UNKNOWN_SCORE) {
1433
- break;
1434
- }
1435
- }
1436
- }
1437
-
1438
- // Decode the best path
1439
- std::vector<llama_token> token_ids;
1440
- token_ids.reserve(path[0][PATH_NUM_TOKENS]);
1441
-
1442
- int pos = 0;
1443
- while (pos < static_cast<int>(data_len)) {
1444
- if (path[pos][PATH_TOKEN_ID] >= 0) {
1445
- token_ids.push_back(path[pos][PATH_TOKEN_ID]);
1446
- } else {
1447
- // Fall back to byte tokens
1448
- uint32_t c = unicode_data[pos];
1449
- int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
1450
-
1451
- for (int i = 0; i < s; ++i) {
1452
- uint8_t b;
1453
- if (s == 1) {
1454
- b = c;
1455
- } else {
1456
- if (i == 0) {
1457
- b = (0xF00 >> s) & 0xFF;
1458
- } else {
1459
- b = 0x80;
1460
- }
1461
- }
1462
- token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
1463
- }
1464
- }
1465
-
1466
- assert(path[pos][PATH_TOKEN_LENGTH] > 0);
1467
- pos += path[pos][PATH_TOKEN_LENGTH];
1468
- }
1469
-
1470
- return token_ids;
1471
- }
1472
- private:
1473
- // Constants for table structure
1474
- static constexpr int32_t TABLE_PIECE_LENGTH = 0;
1475
- static constexpr int32_t TABLE_TOKEN_ID = 1;
1476
- static constexpr int32_t TABLE_SCORE = 2;
1477
- static constexpr int32_t TABLE_PIECE_ID = 3;
1478
-
1479
- // Constants for path array
1480
- static constexpr int32_t PATH_TOKEN_LENGTH = 0;
1481
- static constexpr int32_t PATH_TOKEN_ID = 1;
1482
- static constexpr int32_t PATH_NUM_TOKENS = 2;
1483
-
1484
- // Score constants
1485
- static constexpr int32_t INVALID_SCORE = -20000000;
1486
- static constexpr int32_t UNKNOWN_SCORE = -10000000;
1487
-
1488
- // List of tokens in the vocabulary
1489
- std::vector<std::string> tokens_;
1490
-
1491
- // Mapping from byte code point to token ID (for byte fallback)
1492
- std::vector<llama_token> bytes_;
1493
-
1494
- // Mapping from piece code to suffix ID
1495
- std::unordered_map<int64_t, int32_t> to_suffix_id_;
1496
-
1497
- // Flattened table representing the Trie structure
1498
- // Each row contains: [piece_length, token_id, score, piece_id]
1499
- std::vector<std::vector<int32_t>> table_;
1500
- };
1501
-
1502
- struct llm_tokenizer_plamo2_session {
1503
- llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
1504
-
1505
- void tokenize(const std::string & text, std::vector<llama_token> & output) {
1506
- std::vector<llama_token> tokens = tokenizer.encode(text);
1507
- output.insert(output.end(), tokens.begin(), tokens.end());
1508
- }
1509
-
1510
- private:
1511
- const llm_tokenizer_plamo2 & tokenizer;
1512
- };
1513
-
1514
- //
1515
- // impl
1516
- //
1517
-
1518
- typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
1519
- FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
1520
- FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
1521
- } FRAGMENT_BUFFER_VARIANT_TYPE;
1522
-
1523
- struct fragment_buffer_variant {
1524
- fragment_buffer_variant(llama_token _token)
1525
- :
1526
- type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
1527
- token(_token),
1528
- raw_text(_dummy),
1529
- offset(0),
1530
- length(0) {}
1531
-
1532
- fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
1533
- :
1534
- type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
1535
- token((llama_token) - 1),
1536
- raw_text(_raw_text),
1537
- offset(_offset),
1538
- length(_length){
1539
- GGML_ASSERT(_offset >= 0);
1540
- GGML_ASSERT(_length >= 1);
1541
- GGML_ASSERT(offset + length <= raw_text.length());
1542
- }
1543
-
1544
- const FRAGMENT_BUFFER_VARIANT_TYPE type;
1545
- const llama_token token;
1546
- const std::string _dummy;
1547
- const std::string & raw_text;
1548
- const uint64_t offset;
1549
- const uint64_t length;
1550
- };
1551
-
1552
- struct llama_vocab::impl {
1553
- uint32_t n_token_types = 0; // for BERT-style token types
1554
-
1555
- std::string tokenizer_model;
1556
- std::string tokenizer_pre;
1557
-
1558
- enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
1559
- enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1560
-
1561
- int max_token_len = 0; // used for optimizing longest token search
1562
-
1563
- // default LLaMA special tokens
1564
- // TODO: should we set all of these to LLAMA_TOKEN_NULL?
1565
- llama_token special_bos_id = 1;
1566
- llama_token special_eos_id = 2;
1567
- llama_token special_eot_id = LLAMA_TOKEN_NULL;
1568
- llama_token special_eom_id = LLAMA_TOKEN_NULL;
1569
- llama_token special_unk_id = 0;
1570
- llama_token special_sep_id = LLAMA_TOKEN_NULL;
1571
- llama_token special_pad_id = LLAMA_TOKEN_NULL;
1572
- llama_token special_mask_id = LLAMA_TOKEN_NULL;
1573
-
1574
- llama_token linefeed_id = 13;
1575
-
1576
- // fim tokens
1577
- llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
1578
- llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
1579
- llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
1580
- llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
1581
- llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
1582
- llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
1583
-
1584
- // tokenizer flags
1585
- bool add_space_prefix = false;
1586
- bool add_bos = false;
1587
- bool add_eos = false;
1588
- bool add_sep = false;
1589
- bool ignore_merges = false;
1590
- bool clean_spaces = false; // clean_up_tokenization_spaces
1591
- bool remove_extra_whitespaces = false;
1592
- bool escape_whitespaces = true;
1593
- bool treat_whitespace_as_suffix = false;
1594
-
1595
- std::unordered_map<std::string, llama_token> token_to_id;
1596
- std::vector<token_data> id_to_token;
1597
-
1598
- std::vector<llama_token> cache_special_tokens;
1599
- std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
1600
- struct pair_hash {
1601
- size_t operator()(const std::pair<std::string, std::string> & p) const {
1602
- return std::hash<std::string>{}(p.first) ^ //create some hash for pair
1603
- (std::hash<std::string>{}(p.second) << 1);
1604
- }
1605
- };
1606
- std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
1607
-
1608
- // set of all tokens that cause "end of generation"
1609
- std::set<llama_token> special_eog_ids;
1610
-
1611
- std::unique_ptr<llm_tokenizer> tokenizer;
1612
-
1613
- std::vector<char> precompiled_charsmap;
1614
-
1615
- impl(const llama_vocab & vocab) : vocab(vocab) {
1616
- }
1617
-
1618
- ~impl() = default;
1619
-
1620
- void load(llama_model_loader & ml, const LLM_KV & kv);
1621
-
1622
- enum llama_vocab_type get_type() const;
1623
-
1624
- std::string type_name() const;
1625
-
1626
- bool is_normal (llama_token id) const;
1627
- bool is_unknown (llama_token id) const;
1628
- bool is_control (llama_token id) const;
1629
- bool is_byte (llama_token id) const;
1630
- bool is_user_defined(llama_token id) const;
1631
- bool is_unused (llama_token id) const;
1632
- bool is_eog (llama_token id) const;
1633
-
1634
- uint8_t token_to_byte(llama_token id) const;
1635
-
1636
- llama_token_attr token_get_attr(llama_token id) const;
1637
-
1638
- void init_tokenizer(enum llama_vocab_type type);
1639
-
1640
- void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
1641
-
1642
- std::string token_to_piece_for_cache(
1643
- llama_token token,
1644
- bool special) const;
1645
-
1646
-
1647
- std::vector<llama_token> tokenize(
1648
- const std::string & raw_text,
1649
- bool add_special,
1650
- bool parse_special = false) const;
1651
-
1652
- int32_t tokenize(
1653
- const char * text,
1654
- int32_t text_len,
1655
- llama_token * tokens,
1656
- int32_t n_tokens_max,
1657
- bool add_special,
1658
- bool parse_special) const;
1659
-
1660
- // does not write null-terminator to buf
1661
- int32_t token_to_piece(
1662
- llama_token token,
1663
- char * buf,
1664
- int32_t length,
1665
- int32_t lstrip,
1666
- bool special) const;
1667
-
1668
- // use cached data
1669
- const std::string & token_to_piece(llama_token token) const;
1670
-
1671
- int32_t detokenize(
1672
- const llama_token * tokens,
1673
- int32_t n_tokens,
1674
- char * text,
1675
- int32_t text_len_max,
1676
- bool remove_special,
1677
- bool unparse_special) const;
1678
-
1679
- std::string detokenize(
1680
- const std::vector<llama_token> & tokens,
1681
- bool special) const;
1682
-
1683
- void print_info() const;
1684
-
1685
- private:
1686
- const llama_vocab & vocab;
1687
- };
1688
-
1689
- void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1690
- struct gguf_context * ctx = ml.meta.get();
1691
-
1692
- // determine vocab type
1693
- {
1694
- ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
1695
- ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
1696
-
1697
- ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
1698
-
1699
- if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
1700
- type = LLAMA_VOCAB_TYPE_NONE;
1701
-
1702
- // default special tokens
1703
- special_bos_id = LLAMA_TOKEN_NULL;
1704
- special_eos_id = LLAMA_TOKEN_NULL;
1705
- special_unk_id = LLAMA_TOKEN_NULL;
1706
- special_sep_id = LLAMA_TOKEN_NULL;
1707
- special_pad_id = LLAMA_TOKEN_NULL;
1708
- special_mask_id = LLAMA_TOKEN_NULL;
1709
- linefeed_id = LLAMA_TOKEN_NULL;
1710
-
1711
- // read vocab size from metadata
1712
- uint32_t n_tokens = 0;
1713
- if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
1714
- LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
1715
- id_to_token.resize(n_tokens);
1716
- }
1717
-
1718
- return;
1719
- }
1720
-
1721
- if (tokenizer_model == "llama") {
1722
- type = LLAMA_VOCAB_TYPE_SPM;
1723
-
1724
- // default special tokens
1725
- special_bos_id = 1;
1726
- special_eos_id = 2;
1727
- special_unk_id = 0;
1728
- special_sep_id = LLAMA_TOKEN_NULL;
1729
- special_pad_id = LLAMA_TOKEN_NULL;
1730
- special_mask_id = LLAMA_TOKEN_NULL;
1731
- } else if (tokenizer_model == "bert") {
1732
- type = LLAMA_VOCAB_TYPE_WPM;
1733
-
1734
- // default special tokens
1735
- special_bos_id = 101;
1736
- special_eos_id = LLAMA_TOKEN_NULL;
1737
- special_unk_id = 100;
1738
- special_sep_id = 102;
1739
- special_pad_id = 0;
1740
- special_mask_id = 103;
1741
-
1742
- add_sep = true;
1743
- } else if (tokenizer_model == "gpt2") {
1744
- type = LLAMA_VOCAB_TYPE_BPE;
1745
-
1746
- // read bpe merges and populate bpe ranks
1747
- const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
1748
- if (merges_keyidx == -1) {
1749
- throw std::runtime_error("cannot find tokenizer merges in model file\n");
1750
- }
1751
-
1752
- const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
1753
- for (int i = 0; i < n_merges; i++) {
1754
- const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
1755
- //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
1756
-
1757
- std::string first;
1758
- std::string second;
1759
-
1760
- const size_t pos = word.find(' ', 1);
1761
-
1762
- if (pos != std::string::npos) {
1763
- first = word.substr(0, pos);
1764
- second = word.substr(pos + 1);
1765
- }
1766
-
1767
- bpe_ranks.emplace(std::make_pair(first, second), i);
1768
- }
1769
-
1770
- // default special tokens
1771
- special_bos_id = 11;
1772
- special_eos_id = 11;
1773
- special_unk_id = LLAMA_TOKEN_NULL;
1774
- special_sep_id = LLAMA_TOKEN_NULL;
1775
- special_pad_id = LLAMA_TOKEN_NULL;
1776
- special_mask_id = LLAMA_TOKEN_NULL;
1777
- } else if (tokenizer_model == "t5") {
1778
- type = LLAMA_VOCAB_TYPE_UGM;
1779
-
1780
- // default special tokens
1781
- special_bos_id = LLAMA_TOKEN_NULL;
1782
- special_eos_id = 1;
1783
- special_unk_id = 2;
1784
- special_sep_id = LLAMA_TOKEN_NULL;
1785
- special_pad_id = 0;
1786
- special_mask_id = LLAMA_TOKEN_NULL;
1787
-
1788
- const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
1789
- if (precompiled_charsmap_keyidx != -1) {
1790
- const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
1791
- GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
1792
-
1793
- const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
1794
- const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
1795
- precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
1796
- #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1797
- // correct endiannes of data in precompiled_charsmap binary blob
1798
- uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
1799
- *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
1800
- assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
1801
- size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
1802
- uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
1803
- for (size_t i = 0; i < xcda_array_size; ++i) {
1804
- xcda_array[i] = __builtin_bswap32(xcda_array[i]);
1805
- }
1806
- #endif
1807
- }
1808
- } else if (tokenizer_model == "rwkv") {
1809
- type = LLAMA_VOCAB_TYPE_RWKV;
1810
-
1811
- // default special tokens
1812
- special_bos_id = LLAMA_TOKEN_NULL;
1813
- special_eos_id = LLAMA_TOKEN_NULL;
1814
- special_unk_id = LLAMA_TOKEN_NULL;
1815
- special_sep_id = LLAMA_TOKEN_NULL;
1816
- special_pad_id = LLAMA_TOKEN_NULL;
1817
- } else if (tokenizer_model == "plamo2") {
1818
- type = LLAMA_VOCAB_TYPE_PLAMO2;
1819
-
1820
- // PLaMo-2 default special tokens (these will be overridden by model config)
1821
- special_bos_id = 1; // <|plamo:bos|>
1822
- special_eos_id = 2; // <|plamo:eos|>
1823
- special_unk_id = 0; // <|plamo:unk|>
1824
- special_sep_id = LLAMA_TOKEN_NULL;
1825
- special_pad_id = 3; // <|plamo:pad|>
1826
- special_mask_id = LLAMA_TOKEN_NULL;
1827
- } else {
1828
- throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
1829
- }
1830
-
1831
- // for now, only BPE models have pre-tokenizers
1832
- if (type == LLAMA_VOCAB_TYPE_BPE) {
1833
- add_space_prefix = false;
1834
- clean_spaces = true;
1835
- if (tokenizer_pre.empty()) {
1836
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
1837
- LLAMA_LOG_WARN("%s: \n", __func__);
1838
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1839
- LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
1840
- LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
1841
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1842
- LLAMA_LOG_WARN("%s: \n", __func__);
1843
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1844
- } else if (tokenizer_pre == "default") {
1845
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1846
- } else if (
1847
- tokenizer_pre == "llama3" ||
1848
- tokenizer_pre == "llama-v3" ||
1849
- tokenizer_pre == "llama-bpe"||
1850
- tokenizer_pre == "falcon3" ||
1851
- tokenizer_pre == "falcon-h1" ||
1852
- tokenizer_pre == "pixtral" ||
1853
- tokenizer_pre == "midm-2.0" ||
1854
- tokenizer_pre == "lfm2") {
1855
- pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
1856
- ignore_merges = true;
1857
- add_bos = true;
1858
- } else if (
1859
- tokenizer_pre == "deepseek-llm") {
1860
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
1861
- clean_spaces = false;
1862
- } else if (
1863
- tokenizer_pre == "deepseek-coder") {
1864
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
1865
- clean_spaces = false;
1866
- } else if (
1867
- tokenizer_pre == "deepseek-v3") {
1868
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
1869
- clean_spaces = false;
1870
- } else if (
1871
- tokenizer_pre == "youtu") {
1872
- pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
1873
- clean_spaces = false;
1874
- ignore_merges = true;
1875
- } else if (
1876
- tokenizer_pre == "falcon") {
1877
- pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
1878
- } else if (
1879
- tokenizer_pre == "mpt") {
1880
- pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
1881
- } else if (
1882
- tokenizer_pre == "starcoder") {
1883
- pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
1884
- } else if (
1885
- tokenizer_pre == "gpt-2" ||
1886
- tokenizer_pre == "phi-2" ||
1887
- tokenizer_pre == "jina-es" ||
1888
- tokenizer_pre == "jina-de" ||
1889
- tokenizer_pre == "gigachat" ||
1890
- tokenizer_pre == "jina-v2-es" ||
1891
- tokenizer_pre == "jina-v2-de" ||
1892
- tokenizer_pre == "a.x-4.0" ||
1893
- tokenizer_pre == "mellum" ||
1894
- tokenizer_pre == "modern-bert" ) {
1895
- pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1896
- } else if (
1897
- tokenizer_pre == "jina-v1-en" ||
1898
- tokenizer_pre == "jina-v2-code" ||
1899
- tokenizer_pre == "roberta-bpe") {
1900
- pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1901
- add_sep = true;
1902
- } else if (
1903
- tokenizer_pre == "refact") {
1904
- pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
1905
- } else if (
1906
- tokenizer_pre == "command-r") {
1907
- pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
1908
- clean_spaces = false;
1909
- } else if (
1910
- tokenizer_pre == "qwen2" ||
1911
- tokenizer_pre == "deepseek-r1-qwen" ||
1912
- tokenizer_pre == "kormo") {
1913
- pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1914
- clean_spaces = false;
1915
- } else if (
1916
- tokenizer_pre == "stablelm2") {
1917
- pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
1918
- } else if (
1919
- tokenizer_pre == "olmo") {
1920
- pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
1921
- } else if (
1922
- tokenizer_pre == "dbrx") {
1923
- pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
1924
- } else if (
1925
- tokenizer_pre == "smaug-bpe") {
1926
- pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
1927
- } else if (
1928
- tokenizer_pre == "poro-chat") {
1929
- pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
1930
- clean_spaces = false;
1931
- } else if (
1932
- tokenizer_pre == "glm4" ||
1933
- tokenizer_pre == "chatglm-bpe") {
1934
- pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
1935
- special_bos_id = LLAMA_TOKEN_NULL;
1936
- } else if (
1937
- tokenizer_pre == "viking") {
1938
- pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
1939
- clean_spaces = false;
1940
- } else if (
1941
- tokenizer_pre == "jais") {
1942
- pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
1943
- } else if (
1944
- tokenizer_pre == "tekken") {
1945
- pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
1946
- clean_spaces = false;
1947
- ignore_merges = true;
1948
- add_bos = true;
1949
- } else if (
1950
- tokenizer_pre == "smollm") {
1951
- pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
1952
- clean_spaces = false;
1953
- } else if (
1954
- tokenizer_pre == "codeshell") {
1955
- pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
1956
- } else if (
1957
- tokenizer_pre == "bloom") {
1958
- pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
1959
- } else if (
1960
- tokenizer_pre == "gpt3-finnish") {
1961
- pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
1962
- } else if (
1963
- tokenizer_pre == "exaone") {
1964
- pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
1965
- } else if (
1966
- tokenizer_pre == "exaone4") {
1967
- pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1968
- } else if (
1969
- tokenizer_pre == "chameleon") {
1970
- pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
1971
- add_bos = true;
1972
- clean_spaces = false;
1973
- } else if (
1974
- tokenizer_pre == "minerva-7b") {
1975
- pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
1976
- } else if (
1977
- tokenizer_pre == "megrez") {
1978
- pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1979
- } else if (
1980
- tokenizer_pre == "gpt-4o" ||
1981
- tokenizer_pre == "llama4") {
1982
- pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
1983
- clean_spaces = false;
1984
- } else if (
1985
- tokenizer_pre == "superbpe") {
1986
- pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
1987
- clean_spaces = false;
1988
- } else if (
1989
- tokenizer_pre == "trillion") {
1990
- pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
1991
- clean_spaces = false;
1992
- } else if (
1993
- tokenizer_pre == "granite-docling") {
1994
- pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
1995
- clean_spaces = false;
1996
- } else if (
1997
- tokenizer_pre == "bailingmoe" ||
1998
- tokenizer_pre == "bailingmoe2" ||
1999
- tokenizer_pre == "llada-moe") {
2000
- pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
2001
- clean_spaces = false;
2002
- } else if (
2003
- tokenizer_pre == "seed-coder") {
2004
- pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
2005
- clean_spaces = false;
2006
- } else if (
2007
- tokenizer_pre == "hunyuan") {
2008
- pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
2009
- clean_spaces = false;
2010
- } else if (
2011
- tokenizer_pre == "hunyuan-dense") {
2012
- pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
2013
- clean_spaces = false;
2014
- } else if (
2015
- tokenizer_pre == "kimi-k2") {
2016
- pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
2017
- clean_spaces = false;
2018
- } else if (
2019
- tokenizer_pre == "grok-2") {
2020
- pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
2021
- clean_spaces = false;
2022
- } else if (
2023
- tokenizer_pre == "afmoe") {
2024
- pre_type = LLAMA_VOCAB_PRE_TYPE_AFMOE;
2025
- clean_spaces = false;
2026
- } else if (
2027
- tokenizer_pre == "minimax-m2") {
2028
- pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
2029
- clean_spaces = false;
2030
- } else if (
2031
- tokenizer_pre == "solar-open") {
2032
- pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
2033
- clean_spaces = false;
2034
- } else {
2035
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
2036
- }
2037
- } else if (type == LLAMA_VOCAB_TYPE_SPM) {
2038
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2039
- add_space_prefix = true;
2040
- clean_spaces = false;
2041
- add_bos = true;
2042
- add_eos = false;
2043
- } else if (type == LLAMA_VOCAB_TYPE_WPM) {
2044
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2045
- add_space_prefix = false;
2046
- clean_spaces = true;
2047
- add_bos = true;
2048
- add_eos = false;
2049
- add_sep = true;
2050
- } else if (type == LLAMA_VOCAB_TYPE_UGM) {
2051
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2052
- add_bos = false;
2053
- add_eos = true;
2054
- } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
2055
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2056
- add_space_prefix = false;
2057
- clean_spaces = false;
2058
- add_bos = false;
2059
- add_eos = false;
2060
- } else {
2061
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2062
- }
2063
-
2064
- ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
2065
- ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
2066
- }
2067
-
2068
- const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
2069
- if (token_idx == -1) {
2070
- throw std::runtime_error("cannot find tokenizer vocab in model file\n");
2071
- }
2072
-
2073
- const float * scores = nullptr;
2074
- const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
2075
- if (score_idx != -1) {
2076
- scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
2077
- }
2078
-
2079
- const int * toktypes = nullptr;
2080
- const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
2081
- if (toktype_idx != -1) {
2082
- toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
2083
- }
2084
-
2085
- uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
2086
- id_to_token.resize(n_tokens);
2087
-
2088
- for (uint32_t i = 0; i < n_tokens; i++) {
2089
- std::string word = gguf_get_arr_str(ctx, token_idx, i);
2090
- if (word.empty()) {
2091
- LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
2092
- word = "[EMPTY_" + std::to_string(i) + "]";
2093
- }
2094
-
2095
- token_to_id[word] = i;
2096
- max_token_len = std::max(max_token_len, (int) word.size());
2097
-
2098
- auto & token_data = id_to_token[i];
2099
- token_data.text = std::move(word);
2100
- token_data.score = scores ? scores[i] : 0.0f;
2101
- token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
2102
-
2103
- if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
2104
- switch(toktypes[i]) {
2105
- case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
2106
- case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
2107
- case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
2108
- case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
2109
- case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
2110
- case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
2111
- case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2112
- default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2113
- }
2114
- }
2115
- }
2116
- GGML_ASSERT(id_to_token.size() == token_to_id.size());
2117
-
2118
- init_tokenizer(type);
2119
-
2120
- // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
2121
- if (type == LLAMA_VOCAB_TYPE_SPM) {
2122
- try {
2123
- linefeed_id = vocab.byte_to_token('\n');
2124
- } catch (const std::exception & e) {
2125
- LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
2126
- linefeed_id = special_pad_id;
2127
- }
2128
- } else if (type == LLAMA_VOCAB_TYPE_WPM) {
2129
- linefeed_id = special_pad_id;
2130
- } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
2131
- const std::vector<int> ids = tokenize("\n", false);
2132
- GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2133
- linefeed_id = ids[0];
2134
- } else {
2135
- const std::vector<int> ids = tokenize("\n", false);
2136
-
2137
- //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2138
- if (ids.empty()) {
2139
- LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
2140
- linefeed_id = special_pad_id;
2141
- } else {
2142
- linefeed_id = ids[0];
2143
- }
2144
- }
2145
-
2146
- // special tokens
2147
- {
2148
- const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
2149
- { LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
2150
- { LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
2151
- { LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
2152
- { LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
2153
- { LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
2154
- { LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
2155
- { LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
2156
- { LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
2157
- { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
2158
- { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
2159
- { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
2160
- { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
2161
- { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
2162
- { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
2163
-
2164
- // deprecated
2165
- { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
2166
- { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
2167
- { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
2168
- };
2169
-
2170
- for (const auto & it : special_token_types) {
2171
- const std::string & key = kv(std::get<0>(it));
2172
- int32_t & id = std::get<1>(it);
2173
-
2174
- uint32_t new_id;
2175
- if (!ml.get_key(std::get<0>(it), new_id, false)) {
2176
- continue;
2177
- }
2178
- if (new_id >= id_to_token.size()) {
2179
- LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
2180
- __func__, key.c_str(), new_id, id);
2181
- } else {
2182
- id = new_id;
2183
- }
2184
- }
2185
-
2186
- // Handle add_bos, add_eos and add_sep
2187
- {
2188
- bool temp = true;
2189
-
2190
- if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
2191
- add_bos = temp;
2192
- }
2193
- if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
2194
- add_eos = temp;
2195
- }
2196
- if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
2197
- add_sep = temp;
2198
- }
2199
- }
2200
-
2201
- // auto-detect special tokens by text
2202
- // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
2203
- // for now, we apply this workaround to find the tokens based on their text
2204
-
2205
- for (const auto & t : token_to_id) {
2206
- auto & attr = id_to_token[t.second].attr;
2207
-
2208
- // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
2209
- if (special_eot_id == LLAMA_TOKEN_NULL) {
2210
- if (false
2211
- || t.first == "<|eot_id|>"
2212
- || t.first == "<|im_end|>"
2213
- || t.first == "<|end|>"
2214
- || t.first == "<end_of_turn>"
2215
- || t.first == "<|endoftext|>"
2216
- || t.first == "<|end_of_text|>" // granite
2217
- || t.first == "<EOT>"
2218
- || t.first == "_<EOT>"
2219
- || t.first == "<|end▁of▁sentence|>" // DeepSeek
2220
- || t.first == "<end_of_utterance>" // smoldocling
2221
- ) {
2222
- special_eot_id = t.second;
2223
- if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2224
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2225
- __func__, t.second, t.first.c_str());
2226
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2227
- }
2228
- }
2229
- }
2230
-
2231
- // find EOM token: "<|eom_id|>"
2232
- if (special_eom_id == LLAMA_TOKEN_NULL) {
2233
- if (false
2234
- || t.first == "<|eom_id|>"
2235
- ) {
2236
- special_eom_id = t.second;
2237
- if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2238
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2239
- __func__, t.second, t.first.c_str());
2240
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2241
- }
2242
- }
2243
- }
2244
-
2245
- // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
2246
- if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
2247
- if (false
2248
- || t.first == "<|fim_prefix|>" // Qwen
2249
- || t.first == "<fim-prefix>"
2250
- || t.first == "<fim_prefix>" // Granite
2251
- || t.first == "<|fim▁begin|>" // DeepSeek
2252
- || t.first == "<PRE>"
2253
- || t.first == "▁<PRE>" // CodeLlama
2254
- || t.first == "<|code_prefix|>" // GLM-4.5
2255
- ) {
2256
- special_fim_pre_id = t.second;
2257
- if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2258
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2259
- __func__, t.second, t.first.c_str());
2260
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2261
- }
2262
- }
2263
- }
2264
-
2265
- // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
2266
- if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
2267
- if (false
2268
- || t.first == "<|fim_suffix|>" // Qwen
2269
- || t.first == "<fim-suffix>"
2270
- || t.first == "<fim_suffix>" // Granite
2271
- || t.first == "<|fim▁hole|>" // DeepSeek
2272
- || t.first == "<SUF>"
2273
- || t.first == "▁<SUF>" // CodeLlama
2274
- || t.first == "<|code_suffix|>" // GLM-4.5
2275
- ) {
2276
- special_fim_suf_id = t.second;
2277
- if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2278
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2279
- __func__, t.second, t.first.c_str());
2280
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2281
- }
2282
- }
2283
- }
2284
-
2285
- // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
2286
- if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
2287
- if (false
2288
- || t.first == "<|fim_middle|>" // Qwen
2289
- || t.first == "<fim-middle>"
2290
- || t.first == "<fim_middle>" // Granite
2291
- || t.first == "<|fim▁end|>" // DeepSeek
2292
- || t.first == "<MID>"
2293
- || t.first == "▁<MID>" // CodeLlama
2294
- || t.first == "<|code_middle|>" // GLM-4.5
2295
- ) {
2296
- special_fim_mid_id = t.second;
2297
- if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2298
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2299
- __func__, t.second, t.first.c_str());
2300
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2301
- }
2302
- }
2303
- }
2304
-
2305
- // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
2306
- if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
2307
- if (false
2308
- || t.first == "<|fim_pad|>" // Qwen
2309
- || t.first == "<fim-pad>"
2310
- || t.first == "<fim_pad>" // Granite
2311
- || t.first == "<PAD>"
2312
- ) {
2313
- special_fim_pad_id = t.second;
2314
- if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2315
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2316
- __func__, t.second, t.first.c_str());
2317
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2318
- }
2319
- }
2320
- }
2321
-
2322
- // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
2323
- if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
2324
- if (false
2325
- || t.first == "<|fim_repo|>" // Qwen
2326
- || t.first == "<|repo_name|>"
2327
- || t.first == "<fim-repo>"
2328
- || t.first == "<REPO>"
2329
- || t.first == "<reponame>" // Granite
2330
- ) {
2331
- special_fim_rep_id = t.second;
2332
- if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2333
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2334
- __func__, t.second, t.first.c_str());
2335
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2336
- }
2337
- }
2338
- }
2339
-
2340
- // find FIM_SEP token: "<|file_sep|>"
2341
- if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
2342
- if (false
2343
- || t.first == "<|file_sep|>" // Qwen
2344
- ) {
2345
- special_fim_sep_id = t.second;
2346
- if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2347
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2348
- __func__, t.second, t.first.c_str());
2349
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2350
- }
2351
- }
2352
- }
2353
- }
2354
-
2355
- // auto-detect unused tokens: e.g. control tokens with the word "unused"
2356
- // ideally, these tokens should be marked as unused during conversion
2357
- {
2358
- uint32_t n_unused = 0;
2359
-
2360
- for (const auto & t : token_to_id) {
2361
- auto & attr = id_to_token[t.second].attr;
2362
-
2363
- if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2364
- continue;
2365
- }
2366
-
2367
- if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
2368
- if (strstr(t.first.c_str(), "unused") != NULL) {
2369
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
2370
- }
2371
- }
2372
-
2373
- if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
2374
- n_unused++;
2375
- }
2376
- }
2377
-
2378
- LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
2379
- }
2380
-
2381
- // maintain a list of tokens that cause end-of-generation
2382
- // this is currently determined based on the token text, which is obviously not ideal
2383
- // ref: https://github.com/ggerganov/llama.cpp/issues/9606
2384
- special_eog_ids.clear();
2385
-
2386
- if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
2387
- special_eog_ids.insert(special_fim_pad_id);
2388
- }
2389
-
2390
- if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
2391
- special_eog_ids.insert(special_fim_rep_id);
2392
- }
2393
-
2394
- if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
2395
- special_eog_ids.insert(special_fim_sep_id);
2396
- }
2397
-
2398
- for (const auto & t : token_to_id) {
2399
- auto & attr = id_to_token[t.second].attr;
2400
-
2401
- if (false
2402
- || t.first == "<|eot_id|>"
2403
- || t.first == "<|im_end|>"
2404
- || t.first == "<|end|>"
2405
- || t.first == "<|return|>" // o200k_harmony
2406
- || t.first == "<|call|>" // o200k_harmony
2407
- || t.first == "<|flush|>" // solar-open
2408
- || t.first == "<|calls|>" // solar-open
2409
- || t.first == "<end_of_turn>"
2410
- || t.first == "<|endoftext|>"
2411
- || t.first == "<|eom_id|>"
2412
- || t.first == "<EOT>"
2413
- || t.first == "_<EOT>"
2414
- || t.first == "<|end_of_text|>"
2415
- || t.first == "<end_of_utterance>" // smoldocling
2416
- ) {
2417
- special_eog_ids.insert(t.second);
2418
- if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2419
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2420
- __func__, t.second, t.first.c_str());
2421
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2422
- }
2423
- } else {
2424
- if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
2425
- // token is control, but not marked as EOG -> print a debug log
2426
- if (special_eog_ids.count(t.second) == 0) {
2427
- LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
2428
- __func__, t.second, t.first.c_str());
2429
- }
2430
- }
2431
- }
2432
- }
2433
-
2434
- // @ngxson : quick hack for gpt-oss, always render these tokens
2435
- for (const auto & t : token_to_id) {
2436
- auto & attr = id_to_token[t.second].attr;
2437
-
2438
- if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
2439
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
2440
- }
2441
- }
2442
-
2443
- // sanity checks
2444
- if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
2445
- special_eog_ids.insert(special_eos_id);
2446
- LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2447
- }
2448
-
2449
- if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
2450
- special_eog_ids.insert(special_eot_id);
2451
- LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2452
- }
2453
-
2454
- if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
2455
- special_eog_ids.insert(special_eom_id);
2456
- LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2457
- }
2458
-
2459
- // TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
2460
- // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
2461
- // we remove the "<|end|>" token from the EOG list
2462
- {
2463
- bool has_return = false;
2464
- bool has_call = false;
2465
- bool has_end = false;
2466
- bool has_flush = false;
2467
-
2468
- llama_token end_id = LLAMA_TOKEN_NULL;
2469
-
2470
- LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
2471
- for (auto tid : special_eog_ids) {
2472
- auto & text = id_to_token[tid].text;
2473
-
2474
- LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());
2475
-
2476
- if (text == "<|return|>") {
2477
- has_return = true;
2478
- } else if (text == "<|call|>" || text == "<|calls|>") {
2479
- has_call = true;
2480
- } else if (text == "<|flush|>") {
2481
- has_flush = true;
2482
- } else if (text == "<|end|>") {
2483
- has_end = true;
2484
- end_id = tid;
2485
- }
2486
- }
2487
-
2488
- if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
2489
- special_eog_ids.erase(end_id);
2490
-
2491
- auto & attr = id_to_token[end_id].attr;
2492
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
2493
-
2494
- LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
2495
- }
2496
- }
2497
- }
2498
-
2499
- // build special tokens cache
2500
- {
2501
- for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
2502
- if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
2503
- cache_special_tokens.push_back(id);
2504
- }
2505
- }
2506
-
2507
- std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
2508
- [&] (const llama_token a, const llama_token b) {
2509
- return id_to_token[a].text.size() > id_to_token[b].text.size();
2510
- }
2511
- );
2512
-
2513
- LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
2514
- }
2515
-
2516
- // build token to piece cache
2517
- {
2518
- size_t size_cache = 0;
2519
-
2520
- std::vector<std::string> cache(n_tokens);
2521
-
2522
- for (uint32_t id = 0; id < n_tokens; ++id) {
2523
- cache[id] = token_to_piece_for_cache(id, true);
2524
-
2525
- size_cache += cache[id].size();
2526
- }
2527
-
2528
- std::swap(cache_token_to_piece, cache);
2529
-
2530
- LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
2531
- }
2532
-
2533
- // Handle per token attributes
2534
- //NOTE: Each model customizes per token attributes.
2535
- //NOTE: Per token attributes are missing from the GGUF file.
2536
- //TODO: Extract attributes from GGUF file.
2537
- {
2538
- auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
2539
- for (const auto & substr : substrs) {
2540
- if (str.find(substr) != std::string::npos) {
2541
- return true;
2542
- }
2543
- }
2544
- return false;
2545
- };
2546
-
2547
- auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
2548
- uint32_t current = id_to_token.at(id).attr;
2549
- current = value ? (current | attr) : (current & ~attr);
2550
- id_to_token[id].attr = (llama_token_attr) current;
2551
- };
2552
-
2553
- auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
2554
- _set_tokenid_attr(token_to_id.at(token), attr, value);
2555
- };
2556
-
2557
- std::string model_name;
2558
- std::string tokenizer_pre;
2559
- std::string general_arch;
2560
-
2561
- ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
2562
- ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
2563
- ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
2564
-
2565
- // model name to lowercase
2566
- std::transform(model_name.begin(), model_name.end(), model_name.begin(),
2567
- [] (const std::string::value_type x) {
2568
- return std::tolower(x);
2569
- }
2570
- );
2571
-
2572
- // set attributes by model/tokenizer/architecture name
2573
- if (false
2574
- || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2575
- || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
2576
- ) {
2577
- if (token_to_id.count("<mask>") == 0) {
2578
- LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
2579
- } else {
2580
- _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2581
- }
2582
- } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
2583
- for (auto id : cache_special_tokens) {
2584
- _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
2585
- }
2586
- for (const auto * token : {"</s>"}) {
2587
- _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
2588
- }
2589
- for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
2590
- _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
2591
- }
2592
- } else if (_contains_any(model_name, {"modern-bert"})) {
2593
- if (token_to_id.count("[MASK]") == 0 ) {
2594
- LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
2595
- }
2596
- else {
2597
- _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
2598
- }
2599
- }
2600
- }
2601
- }
2602
-
2603
- enum llama_vocab_type llama_vocab::impl::get_type() const {
2604
- return type;
2605
- }
2606
-
2607
- std::string llama_vocab::impl::type_name() const{
2608
- switch (type) {
2609
- case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
2610
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2611
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2612
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2613
- case LLAMA_VOCAB_TYPE_UGM: return "UGM";
2614
- case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
2615
- case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
2616
- default: return "unknown";
2617
- }
2618
- }
2619
-
2620
- bool llama_vocab::impl::is_normal(llama_token id) const {
2621
- GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2622
- return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
2623
- }
2624
-
2625
- bool llama_vocab::impl::is_unknown(llama_token id) const {
2626
- GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2627
- return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
2628
- }
2629
-
2630
- bool llama_vocab::impl::is_control(llama_token id) const {
2631
- GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2632
- return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
2633
- }
2634
-
2635
- bool llama_vocab::impl::is_byte(llama_token id) const {
2636
- GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2637
- return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
2638
- }
2639
-
2640
- bool llama_vocab::impl::is_user_defined(llama_token id) const {
2641
- GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2642
- return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
2643
- }
2644
-
2645
- bool llama_vocab::impl::is_unused(llama_token id) const {
2646
- GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2647
- return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
2648
- }
2649
-
2650
- bool llama_vocab::impl::is_eog(llama_token id) const {
2651
- return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
2652
- }
2653
-
2654
- uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
2655
- GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
2656
- GGML_ASSERT(is_byte(id));
2657
- const auto & token_data = id_to_token.at(id);
2658
- switch (get_type()) {
2659
- case LLAMA_VOCAB_TYPE_SPM:
2660
- case LLAMA_VOCAB_TYPE_UGM: {
2661
- auto buf = token_data.text.substr(3, 2);
2662
- return strtol(buf.c_str(), NULL, 16);
2663
- }
2664
- case LLAMA_VOCAB_TYPE_BPE: {
2665
- GGML_ABORT("fatal error");
2666
- }
2667
- case LLAMA_VOCAB_TYPE_WPM: {
2668
- GGML_ABORT("fatal error");
2669
- }
2670
- default:
2671
- GGML_ABORT("fatal error");
2672
- }
2673
- }
2674
-
2675
- llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
2676
- GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2677
- return id_to_token.at(id).attr;
2678
- }
2679
-
2680
- void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
2681
- LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
2682
-
2683
- switch (type) {
2684
- case LLAMA_VOCAB_TYPE_SPM:
2685
- tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
2686
- break;
2687
- case LLAMA_VOCAB_TYPE_BPE:
2688
- tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
2689
- break;
2690
- case LLAMA_VOCAB_TYPE_WPM:
2691
- tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
2692
- break;
2693
- case LLAMA_VOCAB_TYPE_UGM:
2694
- tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
2695
- break;
2696
- case LLAMA_VOCAB_TYPE_RWKV:
2697
- tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
2698
- break;
2699
- case LLAMA_VOCAB_TYPE_PLAMO2:
2700
- tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
2701
- break;
2702
- default:
2703
- GGML_ABORT("unsupported vocab type");
2704
- }
2705
- }
2706
-
2707
- //
2708
- // (de-) tokenize
2709
- //
2710
-
2711
- // #define PRETOKENIZERDEBUG
2712
-
2713
- void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
2714
- // for each special token
2715
- for (const llama_token special_id : cache_special_tokens) {
2716
- const auto & data = vocab.get_token_data(special_id);
2717
- const auto & text = data.text;
2718
-
2719
- if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
2720
- // Ignore control and unknown tokens when parse_special == false
2721
- continue;
2722
- // User-defined tokens are still pre-tokenized before everything else
2723
- // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
2724
- // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
2725
- }
2726
-
2727
- // for each text fragment
2728
- std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
2729
- while (it != buffer.end()) {
2730
- auto & fragment = (*it);
2731
-
2732
- // if a fragment is text ( not yet processed )
2733
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2734
- const auto & raw_text = fragment.raw_text;
2735
-
2736
- auto raw_text_base_offset = fragment.offset;
2737
- auto raw_text_base_length = fragment.length;
2738
-
2739
- // loop over the text
2740
- while (true) {
2741
- // find the first occurrence of a given special token in this fragment
2742
- // passing offset argument only limit the "search area" but match coordinates
2743
- // are still relative to the source full raw_text
2744
- // string_view begins at pos 0 for the same reason
2745
- auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);
2746
-
2747
- // no occurrences found, stop processing this fragment for a given special token
2748
- if (match == std::string::npos) break;
2749
-
2750
- #ifdef PRETOKENIZERDEBUG
2751
- LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
2752
- #endif
2753
- auto source = std::distance(buffer.begin(), it);
2754
-
2755
- // if match is further than base offset
2756
- // then we have some text to the left of it
2757
- if (match > raw_text_base_offset) {
2758
- // left
2759
- const int64_t left_reminder_offset = raw_text_base_offset + 0;
2760
- int64_t left_reminder_length = match - raw_text_base_offset;
2761
-
2762
- if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
2763
- while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
2764
- left_reminder_length--;
2765
- }
2766
- }
2767
-
2768
- if (left_reminder_length > 0) {
2769
- buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
2770
- it++;
2771
- }
2772
-
2773
- #ifdef PRETOKENIZERDEBUG
2774
- LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
2775
- #endif
2776
- }
2777
-
2778
- // special token
2779
- buffer.emplace_after(it, special_id);
2780
- it++;
2781
-
2782
- // right
2783
- if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
2784
- int64_t right_reminder_offset = match + text.length();
2785
- int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());
2786
-
2787
- if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
2788
- while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
2789
- right_reminder_offset++;
2790
- right_reminder_length--;
2791
- }
2792
- }
2793
-
2794
- if (right_reminder_length > 0) {
2795
- buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
2796
- it++;
2797
- }
2798
-
2799
- #ifdef PRETOKENIZERDEBUG
2800
- LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
2801
- #endif
2802
-
2803
- if (source == 0) {
2804
- buffer.erase_after(buffer.before_begin());
2805
- } else {
2806
- buffer.erase_after(std::next(buffer.begin(), (source - 1)));
2807
- }
2808
-
2809
- // repeat for the right side
2810
- raw_text_base_offset = right_reminder_offset;
2811
- raw_text_base_length = right_reminder_length;
2812
-
2813
- #ifdef PRETOKENIZERDEBUG
2814
- LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
2815
- #endif
2816
- } else {
2817
- if (source == 0) {
2818
- buffer.erase_after(buffer.before_begin());
2819
- } else {
2820
- buffer.erase_after(std::next(buffer.begin(), (source - 1)));
2821
- }
2822
- break;
2823
- }
2824
- }
2825
- }
2826
- it++;
2827
- }
2828
- }
2829
- }
2830
-
2831
- // NOTE: avoid ever using this except for building the token_to_piece caches
2832
- std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
2833
- std::string piece;
2834
- piece.resize(piece.capacity()); // using string internal cache
2835
- const int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
2836
- if (n_chars < 0) {
2837
- piece.resize(-n_chars);
2838
- int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
2839
- GGML_ASSERT(check == -n_chars);
2840
- }
2841
- else {
2842
- piece.resize(n_chars);
2843
- }
2844
-
2845
- return piece;
2846
- }
2847
-
2848
- static void llama_escape_whitespace(std::string & text) {
2849
- replace_all(text, " ", "\xe2\x96\x81");
2850
- }
2851
-
2852
- static void llama_unescape_whitespace(std::string & word) {
2853
- replace_all(word, "\xe2\x96\x81", " ");
2854
- }
2855
-
2856
- static std::string llama_decode_text(const std::string & text) {
2857
- std::string decoded_text;
2858
-
2859
- const auto cpts = unicode_cpts_from_utf8(text);
2860
- for (const auto cpt : cpts) {
2861
- const auto utf8 = unicode_cpt_to_utf8(cpt);
2862
- try {
2863
- decoded_text += unicode_utf8_to_byte(utf8);
2864
- } catch (const std::out_of_range & /*e*/) {
2865
- decoded_text += "[UNK_BYTE_0x";
2866
- for (const auto c : utf8) {
2867
- decoded_text += format("%02x", (uint8_t) c);
2868
- }
2869
- decoded_text += text + "]";
2870
- }
2871
- }
2872
-
2873
- return decoded_text;
2874
- }
2875
-
2876
- std::vector<llama_token> llama_vocab::impl::tokenize(
2877
- const std::string & raw_text,
2878
- bool add_special,
2879
- bool parse_special) const {
2880
- GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
2881
-
2882
- std::vector<llama_token> output;
2883
- std::forward_list<fragment_buffer_variant> fragment_buffer;
2884
-
2885
- if (!raw_text.empty()) {
2886
- fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
2887
- tokenizer_st_partition(fragment_buffer, parse_special);
2888
- }
2889
-
2890
- switch (get_type()) {
2891
- case LLAMA_VOCAB_TYPE_SPM:
2892
- {
2893
- // OG tokenizer behavior:
2894
- //
2895
- // tokenizer.encode('', add_special_tokens=True) returns [1]
2896
- // tokenizer.encode('', add_special_tokens=False) returns []
2897
-
2898
- bool is_prev_special = true; // prefix with space if first token
2899
-
2900
- if (add_special && add_bos) {
2901
- GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2902
- output.push_back(special_bos_id);
2903
- is_prev_special = true;
2904
- }
2905
-
2906
- for (const auto & fragment : fragment_buffer) {
2907
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2908
- std::string text;
2909
-
2910
- // prefix with space if previous is special
2911
- if (add_space_prefix && is_prev_special) {
2912
- text = ' ';
2913
- }
2914
-
2915
- text += fragment.raw_text.substr(fragment.offset, fragment.length);
2916
-
2917
- #ifdef PRETOKENIZERDEBUG
2918
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2919
- #endif
2920
- llama_escape_whitespace(text);
2921
- llm_tokenizer_spm_session session(vocab);
2922
- session.tokenize(text, output);
2923
- is_prev_special = false;
2924
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2925
- output.push_back(fragment.token);
2926
- is_prev_special = true;
2927
- }
2928
- }
2929
-
2930
- if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
2931
- LLAMA_LOG_WARN(
2932
- "%s: Added a BOS token to the prompt as specified by the model but the prompt "
2933
- "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
2934
- "Are you sure this is what you want?\n", __FUNCTION__);
2935
- }
2936
-
2937
- if (add_special && add_eos) {
2938
- GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
2939
- output.push_back(special_eos_id);
2940
- }
2941
- } break;
2942
- case LLAMA_VOCAB_TYPE_BPE:
2943
- {
2944
- llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
2945
- // it calls some other methods that are not exist in llm_tokenizer,
2946
- // here just cast it to bpe tokenizer object
2947
- if (add_special) {
2948
- session.append_bos(output);
2949
- }
2950
- for (const auto & fragment : fragment_buffer) {
2951
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2952
- std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
2953
-
2954
- #ifdef PRETOKENIZERDEBUG
2955
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2956
- #endif
2957
- session.tokenize(text, output);
2958
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2959
- session.append(fragment.token, output);
2960
- }
2961
- }
2962
-
2963
- if (add_special) {
2964
- session.append_eos(output);
2965
- session.check_double_bos_eos(output);
2966
- }
2967
- } break;
2968
- case LLAMA_VOCAB_TYPE_WPM:
2969
- {
2970
- if (add_special) {
2971
- GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2972
- output.push_back(special_bos_id);
2973
- }
2974
-
2975
- llm_tokenizer_wpm_session session(vocab);
2976
-
2977
- for (const auto & fragment : fragment_buffer) {
2978
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2979
- std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
2980
-
2981
- #ifdef PRETOKENIZERDEBUG
2982
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2983
- #endif
2984
- session.tokenize(text, output);
2985
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2986
- output.push_back(fragment.token);
2987
- }
2988
- }
2989
-
2990
- if (add_special) {
2991
- GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
2992
- output.push_back(special_sep_id);
2993
- }
2994
- } break;
2995
- case LLAMA_VOCAB_TYPE_UGM:
2996
- {
2997
- if (add_special && add_bos) {
2998
- GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2999
- output.push_back(special_bos_id);
3000
- }
3001
- llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));
3002
-
3003
- for (const auto & fragment : fragment_buffer) {
3004
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
3005
- std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
3006
- #ifdef PRETOKENIZERDEBUG
3007
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
3008
- #endif
3009
- session.tokenize(text, output);
3010
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
3011
- output.push_back(fragment.token);
3012
- }
3013
- }
3014
-
3015
- if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
3016
- LLAMA_LOG_WARN(
3017
- "%s: Added a BOS token to the prompt as specified by the model but the prompt "
3018
- "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
3019
- "Are you sure this is what you want?\n", __FUNCTION__);
3020
- }
3021
-
3022
- if (add_special && add_eos) {
3023
- GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
3024
- output.push_back(special_eos_id);
3025
- }
3026
- } break;
3027
- case LLAMA_VOCAB_TYPE_RWKV:
3028
- {
3029
- llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
3030
- for (const auto & fragment : fragment_buffer) {
3031
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
3032
- std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
3033
-
3034
- #ifdef PRETOKENIZERDEBUG
3035
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
3036
- #endif
3037
-
3038
- session.tokenize(text, output);
3039
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
3040
- output.push_back(fragment.token);
3041
- }
3042
- }
3043
- } break;
3044
- case LLAMA_VOCAB_TYPE_PLAMO2:
3045
- {
3046
- llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
3047
- for (const auto & fragment : fragment_buffer) {
3048
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
3049
- std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
3050
-
3051
- #ifdef PRETOKENIZERDEBUG
3052
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
3053
- #endif
3054
-
3055
- session.tokenize(text, output);
3056
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
3057
- output.push_back(fragment.token);
3058
- }
3059
- }
3060
- } break;
3061
- case LLAMA_VOCAB_TYPE_NONE:
3062
- GGML_ABORT("fatal error");
3063
- }
3064
-
3065
- return output;
3066
- }
3067
-
3068
- int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
3069
- // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
3070
- static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
3071
- const llama_token_attr attr = token_get_attr(token);
3072
- if (!special && (attr & attr_special)) {
3073
- return 0;
3074
- }
3075
-
3076
- // copy piece chars to output text buffer
3077
- // skip up to 'lstrip' leading spaces before copying
3078
- auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
3079
- if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
3080
- GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
3081
- }
3082
-
3083
- for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
3084
- token++;
3085
- size--;
3086
- }
3087
- if (length < (int32_t)size) {
3088
- return -(int32_t) size;
3089
- }
3090
- memcpy(buf, token, size);
3091
- return (int32_t) size;
3092
- };
3093
-
3094
- // if we have a cache - use it
3095
- {
3096
- const auto & cache = cache_token_to_piece;
3097
-
3098
- if (!cache.empty()) {
3099
- const auto & result = cache.at(token);
3100
- return _try_copy(result.data(), result.size());
3101
- }
3102
- }
3103
-
3104
- if (0 <= token && token < (int32_t) id_to_token.size()) {
3105
- const std::string & token_text = id_to_token[token].text;
3106
- switch (get_type()) {
3107
- case LLAMA_VOCAB_TYPE_WPM:
3108
- case LLAMA_VOCAB_TYPE_SPM:
3109
- case LLAMA_VOCAB_TYPE_UGM: {
3110
- // NOTE: we accept all unsupported token types,
3111
- // suppressing them like CONTROL tokens.
3112
- if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
3113
- return _try_copy(token_text.data(), token_text.size());
3114
- }
3115
- if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
3116
- std::string result = token_text;
3117
- llama_unescape_whitespace(result);
3118
- return _try_copy(result.data(), result.size());
3119
- }
3120
- if (attr & LLAMA_TOKEN_ATTR_BYTE) {
3121
- char byte = (char) token_to_byte(token);
3122
- return _try_copy((char*) &byte, 1);
3123
- }
3124
- break;
3125
- }
3126
- case LLAMA_VOCAB_TYPE_BPE: {
3127
- // NOTE: we accept all unsupported token types,
3128
- // suppressing them like CONTROL tokens.
3129
- if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
3130
- return _try_copy(token_text.data(), token_text.size());
3131
- }
3132
- if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
3133
- std::string result = llama_decode_text(token_text);
3134
- return _try_copy(result.data(), result.size());
3135
- }
3136
- break;
3137
- }
3138
- case LLAMA_VOCAB_TYPE_RWKV: {
3139
- std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text);
3140
-
3141
- // If we don't have enough space, return an error
3142
- if (result.size() > (size_t)length) {
3143
- return -(int)result.size();
3144
- }
3145
-
3146
- memcpy(buf, result.data(), result.size());
3147
- return (int)result.size();
3148
- }
3149
- case LLAMA_VOCAB_TYPE_PLAMO2: {
3150
- // PLaMo-2 uses similar token handling as BPE/SPM
3151
- if (vocab.is_byte(token)) {
3152
- // Handle byte tokens like <0xXX>
3153
- if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
3154
- int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
3155
- if (length < 1) {
3156
- return -1;
3157
- }
3158
- buf[0] = static_cast<char>(hex_val);
3159
- return 1;
3160
- }
3161
- }
3162
-
3163
- // Normal token - just copy the text
3164
- std::string result = token_text;
3165
- return _try_copy(result.data(), result.size());
3166
- }
3167
- default:
3168
- GGML_ABORT("fatal error");
3169
- }
3170
- }
3171
-
3172
- return 0;
3173
- }
3174
-
3175
- const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
3176
- return cache_token_to_piece.at(token);
3177
- }
3178
-
3179
- int32_t llama_vocab::impl::detokenize(
3180
- const llama_token * tokens,
3181
- int32_t n_tokens,
3182
- char * text,
3183
- int32_t text_len_max,
3184
- bool remove_special,
3185
- bool unparse_special) const {
3186
- if (type == LLAMA_VOCAB_TYPE_NONE) {
3187
- return 0;
3188
- }
3189
-
3190
- GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
3191
-
3192
- int32_t avail = text_len_max;
3193
- int32_t total = 0;
3194
-
3195
- // remove the leading space
3196
- bool remove_space = add_space_prefix;
3197
-
3198
- if (remove_special && add_bos) {
3199
- if (n_tokens > 0 && tokens[0] == special_bos_id) {
3200
- remove_space = false;
3201
- n_tokens--;
3202
- tokens++;
3203
- }
3204
- }
3205
-
3206
- if (remove_special && add_eos) {
3207
- if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
3208
- n_tokens--;
3209
- }
3210
- }
3211
-
3212
- for (int32_t i = 0; i < n_tokens; ++i) {
3213
- GGML_ASSERT(avail >= 0);
3214
- int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special);
3215
- remove_space = false;
3216
- if (n_chars < 0) {
3217
- avail = 0;
3218
- total -= n_chars;
3219
- } else if (n_chars > 0) {
3220
- avail -= n_chars;
3221
- text += n_chars;
3222
- total += n_chars;
3223
- }
3224
- }
3225
-
3226
- if (total > text_len_max) {
3227
- return -total;
3228
- }
3229
-
3230
- if (clean_spaces) {
3231
- text -= total; // restart text
3232
-
3233
- // first pass: characters ?!., //TODO: where do these characters come from?
3234
- const int32_t total1 = total;
3235
- total = total ? 1 : 0;
3236
- for (int32_t i = 1; i < total1; ++i) {
3237
- const char x = text[i];
3238
- if (text[i - 1] == ' ') {
3239
- if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ,"
3240
- total--; // remove space
3241
- }
3242
- }
3243
- text[total++] = x;
3244
- }
3245
-
3246
- // second pass: strip single apostrophe between spaces
3247
- const int32_t total2 = total;
3248
- total = total ? 1 : 0;
3249
- for (int32_t i = 1; i < total2; ++i) {
3250
- const char x = text[i];
3251
- if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' "
3252
- total--; // remove prev space
3253
- text[++i] = '\0'; // remove next space
3254
- }
3255
- text[total++] = x;
3256
- }
3257
-
3258
- // third pass: apostrophe contractions //NOTE: this makes sense?
3259
- const int32_t total3 = total;
3260
- total = total ? 1 : 0;
3261
- for (int32_t i = 1; i < total3; ++i) {
3262
- const char x = text[i];
3263
- if (text[i - 1] == ' ') {
3264
- if (x == '\'' && i + 1 < total3) {
3265
- const char x1 = text[i + 1];
3266
- if (x1 == 't' || x1 == 'd') { // " 't", " 'd"
3267
- //total--; // remove space
3268
- } else if (x1 == 's' || x1 == 'm') { // " 's", " 'm"
3269
- total--; // remove space
3270
- } else if (i + 2 < total3) {
3271
- const char x2 = text[i + 2];
3272
- if ((x1 == 'l' && x2 == 'l')) { // " 'll"
3273
- //total--; // remove space
3274
- } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've"
3275
- total--; // remove space
3276
- } else {
3277
- //total--; // remove space
3278
- }
3279
- } else {
3280
- //total--; // remove space
3281
- }
3282
- }
3283
- }
3284
- text[total++] = x;
3285
- }
3286
- }
3287
-
3288
- return total <= text_len_max ? total : -total;
3289
- }
3290
-
3291
- void llama_vocab::impl::print_info() const {
3292
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
3293
- LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
3294
- LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
3295
-
3296
- // special tokens
3297
- if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
3298
- if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
3299
- if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
3300
- if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
3301
- if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
3302
- if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
3303
- if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
3304
- if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
3305
-
3306
- if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
3307
-
3308
- if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
3309
- if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
3310
- if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
3311
- if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
3312
- if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
3313
- if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
3314
-
3315
- for (const auto & id : special_eog_ids) {
3316
- LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
3317
- }
3318
-
3319
- LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
3320
- }
3321
-
3322
- llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
3323
- }
3324
-
3325
- llama_vocab::~llama_vocab() = default;
3326
-
3327
- void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
3328
- pimpl->load(ml, kv);
3329
- }
3330
-
3331
- std::string llama_vocab::get_tokenizer_model() const {
3332
- return pimpl->tokenizer_model;
3333
- }
3334
-
3335
- std::string llama_vocab::get_tokenizer_pre() const {
3336
- return pimpl->tokenizer_pre;
3337
- }
3338
-
3339
- enum llama_vocab_type llama_vocab::get_type() const {
3340
- return pimpl->type;
3341
- }
3342
-
3343
- enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
3344
- return pimpl->pre_type;
3345
- }
3346
-
3347
- uint32_t llama_vocab::n_tokens() const {
3348
- return (uint32_t) pimpl->id_to_token.size();
3349
- }
3350
-
3351
- uint32_t llama_vocab::n_token_types() const {
3352
- return (uint32_t) pimpl->n_token_types;
3353
- }
3354
-
3355
- std::string llama_vocab::type_name() const{
3356
- return pimpl->type_name();
3357
- }
3358
-
3359
- bool llama_vocab::is_normal(llama_token id) const {
3360
- return pimpl->is_normal(id);
3361
- }
3362
-
3363
- bool llama_vocab::is_unknown(llama_token id) const {
3364
- return pimpl->is_unknown(id);
3365
- }
3366
-
3367
- bool llama_vocab::is_control(llama_token id) const {
3368
- return pimpl->is_control(id);
3369
- }
3370
-
3371
- bool llama_vocab::is_byte(llama_token id) const {
3372
- return pimpl->is_byte(id);
3373
- }
3374
-
3375
- bool llama_vocab::is_user_defined(llama_token id) const {
3376
- return pimpl->is_user_defined(id);
3377
- }
3378
-
3379
- bool llama_vocab::is_unused(llama_token id) const {
3380
- return pimpl->is_unused(id);
3381
- }
3382
-
3383
- bool llama_vocab::is_eog(llama_token id) const {
3384
- return pimpl->is_eog(id);
3385
- }
3386
-
3387
- uint8_t llama_vocab::token_to_byte(llama_token id) const {
3388
- return pimpl->token_to_byte(id);
3389
- }
3390
-
3391
- llama_token llama_vocab::byte_to_token(uint8_t ch) const {
3392
- GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
3393
- static const char * hex = "0123456789ABCDEF";
3394
- switch (get_type()) {
3395
- case LLAMA_VOCAB_TYPE_SPM:
3396
- case LLAMA_VOCAB_TYPE_UGM: {
3397
- const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
3398
- auto token = pimpl->token_to_id.find(buf);
3399
- if (token != pimpl->token_to_id.end()) {
3400
- return (*token).second;
3401
- }
3402
- // Try to fall back to just the byte as a string
3403
- const char buf2[2] = { (char)ch, 0 };
3404
- return pimpl->token_to_id.at(buf2);
3405
- }
3406
- case LLAMA_VOCAB_TYPE_WPM:
3407
- case LLAMA_VOCAB_TYPE_BPE: {
3408
- return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
3409
- }
3410
- case LLAMA_VOCAB_TYPE_PLAMO2: {
3411
- // PLaMo-2 uses byte tokens in format <0xXX>
3412
- char hex_str[8];
3413
- snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
3414
- return pimpl->token_to_id.at(hex_str);
3415
- }
3416
- default:
3417
- GGML_ABORT("fatal error");
3418
- }
3419
- }
3420
-
3421
- llama_token llama_vocab::text_to_token(const std::string & text) const {
3422
- GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
3423
- auto it = pimpl->token_to_id.find(text);
3424
- if (it != pimpl->token_to_id.end()) {
3425
- return (*it).second;
3426
- }
3427
- return LLAMA_TOKEN_NULL;
3428
- }
3429
-
3430
- const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
3431
- GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
3432
- return pimpl->id_to_token.at(id);
3433
- }
3434
-
3435
- const char * llama_vocab::token_get_text(llama_token id) const {
3436
- GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
3437
- return pimpl->id_to_token.at(id).text.c_str();
3438
- }
3439
-
3440
- float llama_vocab::token_get_score(llama_token id) const {
3441
- GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
3442
- return pimpl->id_to_token.at(id).score;
3443
- }
3444
-
3445
- llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
3446
- return pimpl->token_get_attr(id);
3447
- }
3448
-
3449
- llama_token llama_vocab::token_bos() const {
3450
- return pimpl->special_bos_id;
3451
- }
3452
-
3453
- llama_token llama_vocab::token_eos() const {
3454
- return pimpl->special_eos_id;
3455
- }
3456
-
3457
- llama_token llama_vocab::token_eot() const {
3458
- return pimpl->special_eot_id;
3459
- }
3460
-
3461
- llama_token llama_vocab::token_eom() const {
3462
- return pimpl->special_eom_id;
3463
- }
3464
-
3465
- llama_token llama_vocab::token_unk() const {
3466
- return pimpl->special_unk_id;
3467
- }
3468
-
3469
- llama_token llama_vocab::token_sep() const {
3470
- return pimpl->special_sep_id;
3471
- }
3472
-
3473
- llama_token llama_vocab::token_nl() const {
3474
- return pimpl->linefeed_id;
3475
- }
3476
-
3477
- llama_token llama_vocab::token_pad() const {
3478
- return pimpl->special_pad_id;
3479
- }
3480
-
3481
- llama_token llama_vocab::token_prefix() const {
3482
- return pimpl->special_fim_pre_id;
3483
- }
3484
-
3485
- llama_token llama_vocab::token_middle() const {
3486
- return pimpl->special_fim_mid_id;
3487
- }
3488
-
3489
- llama_token llama_vocab::token_suffix() const {
3490
- return pimpl->special_fim_suf_id;
3491
- }
3492
-
3493
- llama_token llama_vocab::token_fim_pre() const {
3494
- return pimpl->special_fim_pre_id;
3495
- }
3496
-
3497
- llama_token llama_vocab::token_fim_suf() const {
3498
- return pimpl->special_fim_suf_id;
3499
- }
3500
-
3501
- llama_token llama_vocab::token_fim_mid() const {
3502
- return pimpl->special_fim_mid_id;
3503
- }
3504
-
3505
- llama_token llama_vocab::token_fim_pad() const {
3506
- return pimpl->special_fim_pad_id;
3507
- }
3508
-
3509
- llama_token llama_vocab::token_fim_rep() const {
3510
- return pimpl->special_fim_rep_id;
3511
- }
3512
-
3513
- llama_token llama_vocab::token_fim_sep() const {
3514
- return pimpl->special_fim_sep_id;
3515
- }
3516
-
3517
- llama_token llama_vocab::token_mask() const {
3518
- return pimpl->special_mask_id;
3519
- }
3520
-
3521
- bool llama_vocab::get_add_space_prefix() const {
3522
- return pimpl->add_space_prefix;
3523
- }
3524
-
3525
- bool llama_vocab::get_add_bos() const {
3526
- return pimpl->add_bos;
3527
- }
3528
-
3529
- bool llama_vocab::get_add_eos() const {
3530
- return pimpl->add_eos;
3531
- }
3532
-
3533
- bool llama_vocab::get_add_sep() const {
3534
- return pimpl->add_sep;
3535
- }
3536
-
3537
- bool llama_vocab::get_ignore_merges() const {
3538
- return pimpl->ignore_merges;
3539
- }
3540
-
3541
- bool llama_vocab::get_clean_spaces() const {
3542
- return pimpl->clean_spaces;
3543
- }
3544
-
3545
- bool llama_vocab::get_remove_extra_whitespaces() const {
3546
- return pimpl->remove_extra_whitespaces;
3547
- }
3548
-
3549
- bool llama_vocab::get_escape_whitespaces() const {
3550
- return pimpl->escape_whitespaces;
3551
- }
3552
-
3553
- bool llama_vocab::get_treat_whitespace_as_suffix() const {
3554
- return pimpl->treat_whitespace_as_suffix;
3555
- }
3556
-
3557
- int llama_vocab::max_token_len() const {
3558
- return pimpl->max_token_len;
3559
- }
3560
-
3561
- int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
3562
- GGML_ASSERT(token_left.find(' ') == std::string::npos);
3563
- GGML_ASSERT(token_left.find('\n') == std::string::npos);
3564
- GGML_ASSERT(token_right.find(' ') == std::string::npos);
3565
- GGML_ASSERT(token_right.find('\n') == std::string::npos);
3566
-
3567
- auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
3568
- if (it == pimpl->bpe_ranks.end()) {
3569
- return -1;
3570
- }
3571
-
3572
- return it->second;
3573
- }
3574
-
3575
- std::vector<std::string> llama_vocab::get_bpe_merges() const {
3576
- std::vector<std::string> result(pimpl->bpe_ranks.size());
3577
-
3578
- for (const auto & pair : pimpl->bpe_ranks) {
3579
- result[pair.second] = pair.first.first + " " + pair.first.second;
3580
- }
3581
-
3582
- return result;
3583
- }
3584
-
3585
- std::vector<char> llama_vocab::get_precompiled_charsmap() const {
3586
- return pimpl->precompiled_charsmap;
3587
- }
3588
-
3589
- int32_t llama_vocab::tokenize(
3590
- const char * text,
3591
- int32_t text_len,
3592
- llama_token * tokens,
3593
- int32_t n_tokens_max,
3594
- bool add_special,
3595
- bool parse_special) const {
3596
- auto res = tokenize(std::string(text, text_len), add_special, parse_special);
3597
- if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
3598
- LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
3599
- return std::numeric_limits<int32_t>::min();
3600
- }
3601
-
3602
- if (n_tokens_max < (int) res.size()) {
3603
- // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
3604
- return -((int) res.size());
3605
- }
3606
-
3607
- for (size_t i = 0; i < res.size(); i++) {
3608
- tokens[i] = res[i];
3609
- }
3610
-
3611
- return res.size();
3612
- }
3613
-
3614
- std::vector<llama_token> llama_vocab::tokenize(
3615
- const std::string & raw_text,
3616
- bool add_special,
3617
- bool parse_special) const {
3618
- return pimpl->tokenize(raw_text, add_special, parse_special);
3619
- }
3620
-
3621
- const std::string & llama_vocab::token_to_piece(llama_token token) const {
3622
- return pimpl->token_to_piece(token);
3623
- }
3624
-
3625
- int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
3626
- return pimpl->token_to_piece(token, buf, length, lstrip, special);
3627
- }
3628
-
3629
- int32_t llama_vocab::detokenize(
3630
- const llama_token * tokens,
3631
- int32_t n_tokens,
3632
- char * text,
3633
- int32_t text_len_max,
3634
- bool remove_special,
3635
- bool unparse_special) const {
3636
- return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
3637
- }
3638
-
3639
- std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
3640
- std::string text;
3641
- text.resize(std::max(text.capacity(), tokens.size()));
3642
- int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
3643
- if (n_chars < 0) {
3644
- text.resize(-n_chars);
3645
- n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
3646
- GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
3647
- }
3648
-
3649
- text.resize(n_chars);
3650
-
3651
- // NOTE: the original tokenizer decodes bytes after collecting the pieces.
3652
- return text;
3653
- }
3654
-
3655
- void llama_vocab::print_info() const {
3656
- pimpl->print_info();
3657
- }
3658
-
3659
- //
3660
- // interface implementation
3661
- //
3662
-
3663
- int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
3664
- return vocab->n_tokens();
3665
- }
3666
-
3667
- // deprecated
3668
- int32_t llama_n_vocab(const struct llama_vocab * vocab) {
3669
- return llama_vocab_n_tokens(vocab);
3670
- }
3671
-
3672
- enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
3673
- return vocab->get_type();
3674
- }
3675
-
3676
- const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
3677
- return vocab->token_get_text(token);
3678
- }
3679
-
3680
- float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
3681
- return vocab->token_get_score(token);
3682
- }
3683
-
3684
- enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
3685
- return vocab->token_get_attr(token);
3686
- }
3687
-
3688
- bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
3689
- return vocab->is_eog(token);
3690
- }
3691
-
3692
- bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
3693
- return vocab->is_control(token);
3694
- }
3695
-
3696
- llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
3697
- return vocab->token_bos();
3698
- }
3699
-
3700
- llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
3701
- return vocab->token_eos();
3702
- }
3703
-
3704
- llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
3705
- return vocab->token_eot();
3706
- }
3707
-
3708
- // deprecated
3709
- llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
3710
- return vocab->token_bos();
3711
- }
3712
-
3713
- llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
3714
- return vocab->token_sep();
3715
- }
3716
-
3717
- llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
3718
- return vocab->token_nl();
3719
- }
3720
-
3721
- llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
3722
- return vocab->token_pad();
3723
- }
3724
-
3725
- bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
3726
- return vocab->get_add_bos();
3727
- }
3728
-
3729
- bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
3730
- return vocab->get_add_eos();
3731
- }
3732
-
3733
- bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
3734
- return vocab->get_add_sep();
3735
- }
3736
-
3737
- llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
3738
- return vocab->token_fim_pre();
3739
- }
3740
-
3741
- llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
3742
- return vocab->token_fim_suf();
3743
- }
3744
-
3745
- llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
3746
- return vocab->token_fim_mid();
3747
- }
3748
-
3749
- llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
3750
- return vocab->token_fim_pad();
3751
- }
3752
-
3753
- llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
3754
- return vocab->token_fim_rep();
3755
- }
3756
-
3757
- llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
3758
- return vocab->token_fim_sep();
3759
- }
3760
-
3761
- llama_token llama_vocab_mask(const struct llama_vocab* vocab) {
3762
- return vocab->token_mask();
3763
- }
3764
-
3765
- // deprecated
3766
- const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
3767
- return llama_vocab_get_text(vocab, token);
3768
- }
3769
-
3770
- // deprecated
3771
- float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
3772
- return llama_vocab_get_score(vocab, token);
3773
- }
3774
-
3775
- // deprecated
3776
- enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
3777
- return llama_vocab_get_attr(vocab, token);
3778
- }
3779
-
3780
- // deprecated
3781
- bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
3782
- return llama_vocab_is_eog(vocab, token);
3783
- }
3784
-
3785
- // deprecated
3786
- bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
3787
- return llama_vocab_is_control(vocab, token);
3788
- }
3789
-
3790
- // deprecated
3791
- llama_token llama_token_bos(const struct llama_vocab * vocab) {
3792
- return llama_vocab_bos(vocab);
3793
- }
3794
-
3795
- // deprecated
3796
- llama_token llama_token_eos(const struct llama_vocab * vocab) {
3797
- return llama_vocab_eos(vocab);
3798
- }
3799
-
3800
- // deprecated
3801
- llama_token llama_token_eot(const struct llama_vocab * vocab) {
3802
- return llama_vocab_eot(vocab);
3803
- }
3804
-
3805
- // deprecated
3806
- llama_token llama_token_cls(const struct llama_vocab * vocab) {
3807
- //return llama_vocab_cls(vocab);
3808
- return llama_vocab_bos(vocab); // avoid deprecation warning
3809
- }
3810
-
3811
- // deprecated
3812
- llama_token llama_token_sep(const struct llama_vocab * vocab) {
3813
- return llama_vocab_sep(vocab);
3814
- }
3815
-
3816
- // deprecated
3817
- llama_token llama_token_nl (const struct llama_vocab * vocab) {
3818
- return llama_vocab_nl(vocab);
3819
- }
3820
-
3821
- // deprecated
3822
- llama_token llama_token_pad(const struct llama_vocab * vocab) {
3823
- return llama_vocab_pad(vocab);
3824
- }
3825
-
3826
- // deprecated
3827
- bool llama_add_bos_token(const struct llama_vocab * vocab) {
3828
- return llama_vocab_get_add_bos(vocab);
3829
- }
3830
-
3831
- // deprecated
3832
- bool llama_add_eos_token(const struct llama_vocab * vocab) {
3833
- return llama_vocab_get_add_eos(vocab);
3834
- }
3835
-
3836
- // deprecated
3837
- llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
3838
- return llama_vocab_fim_pre(vocab);
3839
- }
3840
-
3841
- // deprecated
3842
- llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
3843
- return llama_vocab_fim_suf(vocab);
3844
- }
3845
-
3846
- // deprecated
3847
- llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
3848
- return llama_vocab_fim_mid(vocab);
3849
- }
3850
-
3851
- // deprecated
3852
- llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
3853
- return llama_vocab_fim_pad(vocab);
3854
- }
3855
-
3856
- // deprecated
3857
- llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
3858
- return llama_vocab_fim_rep(vocab);
3859
- }
3860
-
3861
- // deprecated
3862
- llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
3863
- return llama_vocab_fim_sep(vocab);
3864
- }
3865
-
3866
- //
3867
- // tokenization
3868
- //
3869
-
3870
- int32_t llama_tokenize(
3871
- const struct llama_vocab * vocab,
3872
- const char * text,
3873
- int32_t text_len,
3874
- llama_token * tokens,
3875
- int32_t n_tokens_max,
3876
- bool add_special,
3877
- bool parse_special) {
3878
- return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
3879
- }
3880
-
3881
- int32_t llama_token_to_piece(
3882
- const struct llama_vocab * vocab,
3883
- llama_token token,
3884
- char * buf,
3885
- int32_t length,
3886
- int32_t lstrip,
3887
- bool special) {
3888
- return vocab->token_to_piece(token, buf, length, lstrip, special);
3889
- }
3890
-
3891
- int32_t llama_detokenize(
3892
- const struct llama_vocab * vocab,
3893
- const llama_token * tokens,
3894
- int32_t n_tokens,
3895
- char * text,
3896
- int32_t text_len_max,
3897
- bool remove_special,
3898
- bool unparse_special) {
3899
- return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
3900
- }