whispercpp 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (664)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +59 -27
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/build-xcframework.sh +24 -0
  19. data/ext/sources/examples/CMakeLists.txt +1 -0
  20. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  21. data/ext/sources/examples/addon.node/addon.cpp +154 -35
  22. data/ext/sources/examples/addon.node/index.js +10 -5
  23. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  24. data/ext/sources/examples/bench/bench.cpp +29 -18
  25. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  26. data/ext/sources/examples/cli/cli.cpp +7 -4
  27. data/ext/sources/examples/command/command.cpp +58 -32
  28. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/common-whisper.cpp +14 -7
  31. data/ext/sources/examples/lsp/lsp.cpp +21 -17
  32. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  33. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  34. data/ext/sources/examples/server/server.cpp +193 -35
  35. data/ext/sources/examples/server.py +6 -1
  36. data/ext/sources/examples/stream/stream.cpp +10 -2
  37. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  38. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  39. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
  40. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  41. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  42. data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
  43. data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
  44. data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
  45. data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
  46. data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
  47. data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
  48. data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
  49. data/ext/sources/examples/talk-llama/llama-context.h +68 -32
  50. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  52. data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
  53. data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
  54. data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
  55. data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
  56. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  57. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  58. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
  59. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
  60. data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
  61. data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
  62. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
  63. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
  64. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
  65. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
  66. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  67. data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
  68. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  69. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
  70. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  71. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  72. data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
  73. data/ext/sources/examples/talk-llama/llama-model.h +87 -9
  74. data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
  75. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  76. data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
  77. data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
  78. data/ext/sources/examples/talk-llama/llama.cpp +76 -17
  79. data/ext/sources/examples/talk-llama/llama.h +176 -151
  80. data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
  81. data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
  82. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  83. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  84. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
  85. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  86. data/ext/sources/ggml/CMakeLists.txt +106 -33
  87. data/ext/sources/ggml/cmake/common.cmake +24 -0
  88. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  89. data/ext/sources/ggml/include/ggml-backend.h +18 -2
  90. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  91. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  92. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  93. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  94. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  95. data/ext/sources/ggml/include/ggml.h +365 -21
  96. data/ext/sources/ggml/src/CMakeLists.txt +98 -25
  97. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  98. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  99. data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
  100. data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
  101. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
  102. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  103. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
  104. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  105. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  106. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  107. data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
  108. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
  109. data/ext/sources/ggml/src/ggml-common.h +21 -0
  110. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
  111. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
  112. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  113. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  114. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
  115. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
  116. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
  117. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  118. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  119. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
  120. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  121. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  122. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  123. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  124. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  125. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
  126. data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
  127. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
  128. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
  129. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
  130. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  131. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  132. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  133. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
  134. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
  135. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  136. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
  137. data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
  138. data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
  139. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
  140. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
  141. data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
  142. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
  143. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  144. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  145. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  146. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  147. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
  148. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
  149. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
  150. data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
  151. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  152. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  153. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  154. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  155. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  156. data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
  157. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  158. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  159. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  160. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  161. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  162. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  163. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  164. data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
  165. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
  166. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  167. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  168. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  169. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  170. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
  171. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
  172. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  173. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  174. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  175. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
  176. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  177. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  178. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  179. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
  180. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  181. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  182. data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
  183. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  184. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  185. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  186. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  187. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  188. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  189. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
  190. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
  191. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  192. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  193. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  195. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  196. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  197. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  198. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  199. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  200. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  201. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  202. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  203. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  204. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  205. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  206. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  208. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  210. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  211. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
  212. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  213. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
  214. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  234. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  235. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  236. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  237. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  238. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  239. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  240. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  241. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  242. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  243. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  244. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  245. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  246. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  247. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  248. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  249. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  251. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  252. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  254. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  255. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  259. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  260. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  262. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  270. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  271. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  272. data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
  274. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  277. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  278. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  279. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
  280. data/ext/sources/ggml/src/ggml-impl.h +229 -175
  281. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
  282. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  283. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  284. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  285. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  286. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  287. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  288. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  289. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
  290. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  291. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  292. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  293. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
  294. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  295. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  296. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
  297. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  344. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  345. data/ext/sources/ggml/src/ggml-quants.c +117 -24
  346. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  347. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
  348. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  349. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  350. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
  351. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  352. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  353. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
  354. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
  355. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
  356. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  357. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  358. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
  359. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
  360. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
  361. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
  362. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
  363. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  364. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  365. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  366. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
  367. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
  368. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  369. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  370. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  371. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  372. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
  373. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
  374. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  375. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  401. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  402. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  403. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  404. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
  449. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  450. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  451. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  452. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  453. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  454. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  455. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  456. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  457. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  458. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  459. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  460. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  461. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  462. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  463. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  464. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  465. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  466. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  467. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  468. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  469. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  470. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  471. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  472. data/ext/sources/ggml/src/ggml.c +802 -142
  473. data/ext/sources/ggml/src/ggml.cpp +26 -0
  474. data/ext/sources/ggml/src/gguf.cpp +32 -4
  475. data/ext/sources/include/whisper.h +2 -0
  476. data/ext/sources/src/CMakeLists.txt +2 -0
  477. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  478. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  479. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  480. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  481. data/ext/sources/src/whisper.cpp +241 -215
  482. data/ext/sources/tests/CMakeLists.txt +8 -1
  483. data/ext/sources/tests/test-vad-full.cpp +3 -3
  484. data/ext/sources/tests/test-vad.cpp +2 -2
  485. data/extsources.rb +15 -9
  486. data/lib/whisper/context.rb +15 -0
  487. data/lib/whisper/model/uri.rb +57 -2
  488. data/lib/whisper/segment.rb +58 -0
  489. data/sig/whisper.rbs +75 -38
  490. data/{tests → test}/helper.rb +1 -12
  491. data/{tests → test}/test_model.rb +9 -0
  492. data/test/test_package.rb +51 -0
  493. data/{tests → test}/test_params.rb +8 -0
  494. data/test/test_segment.rb +146 -0
  495. data/{tests → test}/test_whisper.rb +70 -0
  496. data/whispercpp.gemspec +2 -3
  497. metadata +246 -191
  498. data/ext/sources/.dockerignore +0 -3
  499. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  500. data/ext/sources/ci/run.sh +0 -336
  501. data/ext/sources/close-issue.yml +0 -28
  502. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  503. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  504. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  505. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  506. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  507. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  508. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  509. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  510. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  511. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  512. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  513. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  514. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  515. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  516. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  517. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  518. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
  519. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  520. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  521. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  522. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  523. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  524. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  525. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  526. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  527. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  548. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  549. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  550. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  551. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  552. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  553. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  554. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  555. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  556. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  557. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  558. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  559. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  560. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  561. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  562. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  563. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  564. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  565. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  566. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  567. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  568. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  569. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  570. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  571. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  572. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  573. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  574. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  575. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  576. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  577. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  578. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  579. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  580. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  581. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  582. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  583. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  584. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  585. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  586. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  587. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  588. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  589. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  590. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  591. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  592. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  593. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  594. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  595. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  596. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  597. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  598. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  599. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  600. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  601. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  602. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  603. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  604. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  605. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  606. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  607. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  608. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  609. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  610. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  611. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  612. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  613. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  614. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  615. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  616. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  617. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  618. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  619. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  620. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  621. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  622. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  623. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  624. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  625. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  626. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  627. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  628. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  629. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  630. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  631. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  632. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  633. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  634. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  635. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  636. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  637. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  638. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  639. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  640. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  641. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  642. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  643. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  644. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  645. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  646. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  647. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  648. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  649. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  650. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  651. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  652. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  653. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
  654. data/tests/test_package.rb +0 -46
  655. data/tests/test_segment.rb +0 -74
  656. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  657. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  658. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  659. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  660. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  661. /data/{tests → test}/test_callback.rb +0 -0
  662. /data/{tests → test}/test_error.rb +0 -0
  663. /data/{tests → test}/test_vad.rb +0 -0
  664. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -9,16 +9,17 @@
9
9
 
10
10
  #include <algorithm>
11
11
  #include <cassert>
12
+ #include <cctype>
12
13
  #include <cfloat>
13
- #include <climits>
14
+ #include <cmath>
14
15
  #include <cstdarg>
15
16
  #include <cstring>
16
17
  #include <forward_list>
18
+ #include <limits>
17
19
  #include <map>
18
20
  #include <queue>
19
21
  #include <set>
20
22
  #include <unordered_map>
21
- #include <cctype>
22
23
 
23
24
  //
24
25
  // helpers
@@ -306,6 +307,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
306
307
  };
307
308
  break;
308
309
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
310
+ case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
309
311
  regex_exprs = {
310
312
  "\\p{N}{1,3}",
311
313
  "[一-龥぀-ゟ゠-ヿ]+",
@@ -351,6 +353,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
351
353
  break;
352
354
  case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
353
355
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
356
+ case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
354
357
  regex_exprs = {
355
358
  // original regex from tokenizer.json
356
359
  // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -403,6 +406,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
403
406
  "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
404
407
  };
405
408
  break;
409
+ case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
410
+ regex_exprs = {
411
+ // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
412
+ // The custom handler implements all K2 patterns with proper Han character exclusion
413
+ "\\p{Han}+",
414
+ };
415
+ break;
406
416
  case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
407
417
  regex_exprs = {
408
418
  "\\p{N}+",
@@ -424,6 +434,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
424
434
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
425
435
  };
426
436
  break;
437
+ case LLAMA_VOCAB_PRE_TYPE_GROK_2:
438
+ regex_exprs = {
439
+ // original regex from tokenizer.json
440
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
441
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
442
+ };
443
+ break;
427
444
  default:
428
445
  // default regex for BPE tokenization pre-processing
429
446
  regex_exprs = {
@@ -1195,6 +1212,284 @@ private:
1195
1212
  const llm_tokenizer_rwkv & tokenizer;
1196
1213
  };
1197
1214
 
1215
+ struct llm_tokenizer_plamo2 : llm_tokenizer {
1216
+ llm_tokenizer_plamo2(const llama_vocab & vocab) {
1217
+ build(vocab);
1218
+ }
1219
+
1220
+ void build(const llama_vocab & vocab) {
1221
+ // Reset internal structures
1222
+ tokens_.clear();
1223
+ bytes_.assign(256, 0);
1224
+ to_suffix_id_.clear();
1225
+ table_.clear();
1226
+
1227
+ // Build token list and byte mapping
1228
+ std::unordered_map<std::string, float> suffix_to_score;
1229
+ std::unordered_map<std::string, llama_token> token_to_id;
1230
+
1231
+ for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
1232
+ const auto & entry = vocab.get_token_data(token_id);
1233
+ tokens_.push_back(entry.text);
1234
+ token_to_id[entry.text] = static_cast<llama_token>(token_id);
1235
+
1236
+ // Handle byte tokens
1237
+ if (vocab.is_byte(token_id)) {
1238
+ if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
1239
+ std::string hex_str = entry.text.substr(3, 2);
1240
+ int byte_val = std::stoi(hex_str, nullptr, 16);
1241
+ bytes_[byte_val] = static_cast<llama_token>(token_id);
1242
+ }
1243
+ continue;
1244
+ }
1245
+
1246
+ // Add token and all its suffixes to suffix_to_score
1247
+ suffix_to_score[entry.text] = entry.score;
1248
+
1249
+ // Extract suffixes character by character (UTF-8 aware)
1250
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
1251
+ for (size_t i = 1; i < cpts.size(); ++i) {
1252
+ std::string suffix;
1253
+ for (size_t j = i; j < cpts.size(); ++j) {
1254
+ suffix += unicode_cpt_to_utf8(cpts[j]);
1255
+ }
1256
+ if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
1257
+ suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
1258
+ }
1259
+ }
1260
+ }
1261
+
1262
+ // Check that all byte tokens are set
1263
+ for (int i = 0; i < 256; ++i) {
1264
+ if (bytes_[i] == 0) {
1265
+ throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
1266
+ }
1267
+ }
1268
+
1269
+ // Build suffix list in lexicographical order of reversed strings
1270
+ std::vector<std::string> suffixes;
1271
+ for (const auto & pair : suffix_to_score) {
1272
+ suffixes.push_back(pair.first);
1273
+ }
1274
+ suffixes.push_back(""); // Empty suffix
1275
+
1276
+ std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
1277
+ std::string rev_a(a.rbegin(), a.rend());
1278
+ std::string rev_b(b.rbegin(), b.rend());
1279
+ return rev_a < rev_b;
1280
+ });
1281
+
1282
+ // Build suffix_to_id and to_suffix_id_
1283
+ std::unordered_map<std::string, int32_t> suffix_to_id;
1284
+ int32_t num_pieces = 0;
1285
+
1286
+ for (const auto & suffix : suffixes) {
1287
+ suffix_to_id[suffix] = num_pieces;
1288
+ if (!suffix.empty()) {
1289
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
1290
+
1291
+ std::string remaining;
1292
+ for (size_t i = 1; i < cpts.size(); ++i) {
1293
+ remaining += unicode_cpt_to_utf8(cpts[i]);
1294
+ }
1295
+
1296
+ int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
1297
+ to_suffix_id_[piece_code] = num_pieces;
1298
+
1299
+ // Count number of pieces for this suffix
1300
+ int32_t pieces_for_suffix = 1; // sentinel row
1301
+ for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
1302
+ std::string piece;
1303
+ for (int32_t i = 0; i < piece_length; ++i) {
1304
+ piece += unicode_cpt_to_utf8(cpts[i]);
1305
+ }
1306
+ if (suffix_to_score.find(piece) != suffix_to_score.end()) {
1307
+ pieces_for_suffix++;
1308
+ }
1309
+ }
1310
+ num_pieces += pieces_for_suffix;
1311
+ } else {
1312
+ num_pieces++; // Empty suffix contributes one piece (sentinel row)
1313
+ }
1314
+ }
1315
+
1316
+ // Build flattened table
1317
+ table_.resize(num_pieces, std::vector<int32_t>(4, 0));
1318
+ int32_t table_idx = 0;
1319
+
1320
+ for (const auto & suffix : suffixes) {
1321
+ // Add all prefixes of the suffix to the table (in decreasing order of length)
1322
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
1323
+ for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
1324
+ std::string piece;
1325
+ for (int32_t i = 0; i < piece_length; ++i) {
1326
+ piece += unicode_cpt_to_utf8(cpts[i]);
1327
+ }
1328
+
1329
+ auto score_it = suffix_to_score.find(piece);
1330
+ if (score_it == suffix_to_score.end()) {
1331
+ continue;
1332
+ }
1333
+
1334
+ table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
1335
+ auto token_it = token_to_id.find(piece);
1336
+ table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;
1337
+
1338
+ float score = score_it->second;
1339
+ table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
1340
+ static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
1341
+ table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];
1342
+
1343
+ table_idx++;
1344
+ }
1345
+
1346
+ // Add sentinel row
1347
+ table_[table_idx][TABLE_PIECE_LENGTH] = 1;
1348
+ table_[table_idx][TABLE_TOKEN_ID] = -1;
1349
+ table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
1350
+ table_idx++;
1351
+ }
1352
+ }
1353
+
1354
+ std::vector<llama_token> encode(const std::string & text) const {
1355
+ std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
1356
+ // Skip the first code point if it is a BOM (Byte Order Mark)
1357
+ if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
1358
+ unicode_data.erase(unicode_data.begin());
1359
+ }
1360
+
1361
+ if (unicode_data.empty()) {
1362
+ return {};
1363
+ }
1364
+
1365
+ const size_t data_len = unicode_data.size();
1366
+
1367
+ // Initialize scores array (dynamic programming)
1368
+ std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
1369
+ scores[data_len] = 0;
1370
+
1371
+ // Path array to track best tokenization
1372
+ std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));
1373
+
1374
+ int32_t suffix_id = 0;
1375
+
1376
+ // Process from end to beginning
1377
+ for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
1378
+ uint32_t c = unicode_data[i];
1379
+
1380
+ // Find next suffix ID
1381
+ for (size_t p = suffix_id; p < table_.size(); ++p) {
1382
+ int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
1383
+ auto it = to_suffix_id_.find(piece_code);
1384
+ suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;
1385
+
1386
+ if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
1387
+ break;
1388
+ }
1389
+ }
1390
+
1391
+ // Update best path
1392
+ for (size_t p = suffix_id; p < table_.size(); ++p) {
1393
+ int32_t score = table_[p][TABLE_SCORE];
1394
+ if (score > INVALID_SCORE) {
1395
+ int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
1396
+ int64_t s = scores[i + piece_length] - score;
1397
+
1398
+ if (s < scores[i]) {
1399
+ scores[i] = s;
1400
+ path[i][PATH_TOKEN_LENGTH] = piece_length;
1401
+ path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
1402
+ path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;
1403
+
1404
+ if (score == UNKNOWN_SCORE) {
1405
+ // Add UTF-8 byte count
1406
+ path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
1407
+ }
1408
+ }
1409
+ }
1410
+
1411
+ if (score == UNKNOWN_SCORE) {
1412
+ break;
1413
+ }
1414
+ }
1415
+ }
1416
+
1417
+ // Decode the best path
1418
+ std::vector<llama_token> token_ids;
1419
+ token_ids.reserve(path[0][PATH_NUM_TOKENS]);
1420
+
1421
+ int pos = 0;
1422
+ while (pos < static_cast<int>(data_len)) {
1423
+ if (path[pos][PATH_TOKEN_ID] >= 0) {
1424
+ token_ids.push_back(path[pos][PATH_TOKEN_ID]);
1425
+ } else {
1426
+ // Fall back to byte tokens
1427
+ uint32_t c = unicode_data[pos];
1428
+ int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
1429
+
1430
+ for (int i = 0; i < s; ++i) {
1431
+ uint8_t b;
1432
+ if (s == 1) {
1433
+ b = c;
1434
+ } else {
1435
+ if (i == 0) {
1436
+ b = (0xF00 >> s) & 0xFF;
1437
+ } else {
1438
+ b = 0x80;
1439
+ }
1440
+ }
1441
+ token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
1442
+ }
1443
+ }
1444
+
1445
+ assert(path[pos][PATH_TOKEN_LENGTH] > 0);
1446
+ pos += path[pos][PATH_TOKEN_LENGTH];
1447
+ }
1448
+
1449
+ return token_ids;
1450
+ }
1451
+ private:
1452
+ // Constants for table structure
1453
+ static constexpr int32_t TABLE_PIECE_LENGTH = 0;
1454
+ static constexpr int32_t TABLE_TOKEN_ID = 1;
1455
+ static constexpr int32_t TABLE_SCORE = 2;
1456
+ static constexpr int32_t TABLE_PIECE_ID = 3;
1457
+
1458
+ // Constants for path array
1459
+ static constexpr int32_t PATH_TOKEN_LENGTH = 0;
1460
+ static constexpr int32_t PATH_TOKEN_ID = 1;
1461
+ static constexpr int32_t PATH_NUM_TOKENS = 2;
1462
+
1463
+ // Score constants
1464
+ static constexpr int32_t INVALID_SCORE = -20000000;
1465
+ static constexpr int32_t UNKNOWN_SCORE = -10000000;
1466
+
1467
+ // List of tokens in the vocabulary
1468
+ std::vector<std::string> tokens_;
1469
+
1470
+ // Mapping from byte code point to token ID (for byte fallback)
1471
+ std::vector<llama_token> bytes_;
1472
+
1473
+ // Mapping from piece code to suffix ID
1474
+ std::unordered_map<int64_t, int32_t> to_suffix_id_;
1475
+
1476
+ // Flattened table representing the Trie structure
1477
+ // Each row contains: [piece_length, token_id, score, piece_id]
1478
+ std::vector<std::vector<int32_t>> table_;
1479
+ };
1480
+
1481
+ struct llm_tokenizer_plamo2_session {
1482
+ llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
1483
+
1484
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
1485
+ std::vector<llama_token> tokens = tokenizer.encode(text);
1486
+ output.insert(output.end(), tokens.begin(), tokens.end());
1487
+ }
1488
+
1489
+ private:
1490
+ const llm_tokenizer_plamo2 & tokenizer;
1491
+ };
1492
+
1198
1493
  //
1199
1494
  // impl
1200
1495
  //
@@ -1269,6 +1564,7 @@ struct llama_vocab::impl {
1269
1564
  bool add_space_prefix = false;
1270
1565
  bool add_bos = false;
1271
1566
  bool add_eos = false;
1567
+ bool add_sep = false;
1272
1568
  bool ignore_merges = false;
1273
1569
  bool clean_spaces = false; // clean_up_tokenization_spaces
1274
1570
  bool remove_extra_whitespaces = false;
@@ -1421,6 +1717,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1421
1717
  special_sep_id = 102;
1422
1718
  special_pad_id = 0;
1423
1719
  special_mask_id = 103;
1720
+
1721
+ add_sep = true;
1424
1722
  } else if (tokenizer_model == "gpt2") {
1425
1723
  type = LLAMA_VOCAB_TYPE_BPE;
1426
1724
 
@@ -1474,7 +1772,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1474
1772
  const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
1475
1773
  const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
1476
1774
  precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
1477
- #ifdef IS_BIG_ENDIAN
1775
+ #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1478
1776
  // correct endiannes of data in precompiled_charsmap binary blob
1479
1777
  uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
1480
1778
  *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
@@ -1495,6 +1793,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1495
1793
  special_unk_id = LLAMA_TOKEN_NULL;
1496
1794
  special_sep_id = LLAMA_TOKEN_NULL;
1497
1795
  special_pad_id = LLAMA_TOKEN_NULL;
1796
+ } else if (tokenizer_model == "plamo2") {
1797
+ type = LLAMA_VOCAB_TYPE_PLAMO2;
1798
+
1799
+ // PLaMo-2 default special tokens (these will be overridden by model config)
1800
+ special_bos_id = 1; // <|plamo:bos|>
1801
+ special_eos_id = 2; // <|plamo:eos|>
1802
+ special_unk_id = 0; // <|plamo:unk|>
1803
+ special_sep_id = LLAMA_TOKEN_NULL;
1804
+ special_pad_id = 3; // <|plamo:pad|>
1805
+ special_mask_id = LLAMA_TOKEN_NULL;
1498
1806
  } else {
1499
1807
  throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
1500
1808
  }
@@ -1519,7 +1827,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1519
1827
  tokenizer_pre == "llama-v3" ||
1520
1828
  tokenizer_pre == "llama-bpe"||
1521
1829
  tokenizer_pre == "falcon3" ||
1522
- tokenizer_pre == "pixtral") {
1830
+ tokenizer_pre == "falcon-h1" ||
1831
+ tokenizer_pre == "pixtral" ||
1832
+ tokenizer_pre == "midm-2.0" ||
1833
+ tokenizer_pre == "lfm2") {
1523
1834
  pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
1524
1835
  ignore_merges = true;
1525
1836
  add_bos = true;
@@ -1550,12 +1861,17 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1550
1861
  tokenizer_pre == "jina-es" ||
1551
1862
  tokenizer_pre == "jina-de" ||
1552
1863
  tokenizer_pre == "gigachat" ||
1553
- tokenizer_pre == "jina-v1-en" ||
1554
1864
  tokenizer_pre == "jina-v2-es" ||
1555
1865
  tokenizer_pre == "jina-v2-de" ||
1866
+ tokenizer_pre == "a.x-4.0" ||
1867
+ tokenizer_pre == "mellum") {
1868
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1869
+ } else if (
1870
+ tokenizer_pre == "jina-v1-en" ||
1556
1871
  tokenizer_pre == "jina-v2-code" ||
1557
1872
  tokenizer_pre == "roberta-bpe") {
1558
1873
  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1874
+ add_sep = true;
1559
1875
  } else if (
1560
1876
  tokenizer_pre == "refact") {
1561
1877
  pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -1618,6 +1934,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1618
1934
  } else if (
1619
1935
  tokenizer_pre == "exaone") {
1620
1936
  pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
1937
+ } else if (
1938
+ tokenizer_pre == "exaone4") {
1939
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1621
1940
  } else if (
1622
1941
  tokenizer_pre == "chameleon") {
1623
1942
  pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
@@ -1643,13 +1962,30 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1643
1962
  pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
1644
1963
  clean_spaces = false;
1645
1964
  } else if (
1646
- tokenizer_pre == "bailingmoe") {
1965
+ tokenizer_pre == "bailingmoe" ||
1966
+ tokenizer_pre == "llada-moe") {
1647
1967
  pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
1648
1968
  clean_spaces = false;
1649
1969
  } else if (
1650
1970
  tokenizer_pre == "seed-coder") {
1651
1971
  pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
1652
1972
  clean_spaces = false;
1973
+ } else if (
1974
+ tokenizer_pre == "hunyuan") {
1975
+ pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
1976
+ clean_spaces = false;
1977
+ } else if (
1978
+ tokenizer_pre == "hunyuan-dense") {
1979
+ pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
1980
+ clean_spaces = false;
1981
+ } else if (
1982
+ tokenizer_pre == "kimi-k2") {
1983
+ pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
1984
+ clean_spaces = false;
1985
+ } else if (
1986
+ tokenizer_pre == "grok-2") {
1987
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
1988
+ clean_spaces = false;
1653
1989
  } else {
1654
1990
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
1655
1991
  }
@@ -1665,6 +2001,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1665
2001
  clean_spaces = true;
1666
2002
  add_bos = true;
1667
2003
  add_eos = false;
2004
+ add_sep = true;
1668
2005
  } else if (type == LLAMA_VOCAB_TYPE_UGM) {
1669
2006
  pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1670
2007
  add_bos = false;
@@ -1801,7 +2138,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1801
2138
  }
1802
2139
  }
1803
2140
 
1804
- // Handle add_bos and add_eos
2141
+ // Handle add_bos, add_eos and add_sep
1805
2142
  {
1806
2143
  bool temp = true;
1807
2144
 
@@ -1811,6 +2148,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1811
2148
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
1812
2149
  add_eos = temp;
1813
2150
  }
2151
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
2152
+ add_sep = temp;
2153
+ }
1814
2154
  }
1815
2155
 
1816
2156
  // auto-detect special tokens by text
@@ -1829,6 +2169,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1829
2169
  || t.first == "<EOT>"
1830
2170
  || t.first == "_<EOT>"
1831
2171
  || t.first == "<|end▁of▁sentence|>" // DeepSeek
2172
+ || t.first == "<end_of_utterance>" // smoldocling
1832
2173
  ) {
1833
2174
  special_eot_id = t.second;
1834
2175
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1862,6 +2203,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1862
2203
  || t.first == "<|fim▁begin|>" // DeepSeek
1863
2204
  || t.first == "<PRE>"
1864
2205
  || t.first == "▁<PRE>" // CodeLlama
2206
+ || t.first == "<|code_prefix|>" // GLM-4.5
1865
2207
  ) {
1866
2208
  special_fim_pre_id = t.second;
1867
2209
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1881,6 +2223,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1881
2223
  || t.first == "<|fim▁hole|>" // DeepSeek
1882
2224
  || t.first == "<SUF>"
1883
2225
  || t.first == "▁<SUF>" // CodeLlama
2226
+ || t.first == "<|code_suffix|>" // GLM-4.5
1884
2227
  ) {
1885
2228
  special_fim_suf_id = t.second;
1886
2229
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1900,6 +2243,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1900
2243
  || t.first == "<|fim▁end|>" // DeepSeek
1901
2244
  || t.first == "<MID>"
1902
2245
  || t.first == "▁<MID>" // CodeLlama
2246
+ || t.first == "<|code_middle|>" // GLM-4.5
1903
2247
  ) {
1904
2248
  special_fim_mid_id = t.second;
1905
2249
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1982,11 +2326,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1982
2326
  || t.first == "<|eot_id|>"
1983
2327
  || t.first == "<|im_end|>"
1984
2328
  || t.first == "<|end|>"
2329
+ || t.first == "<|return|>" // o200k_harmony
2330
+ || t.first == "<|call|>" // o200k_harmony
1985
2331
  || t.first == "<end_of_turn>"
1986
2332
  || t.first == "<|endoftext|>"
1987
2333
  || t.first == "<|eom_id|>"
1988
2334
  || t.first == "<EOT>"
1989
2335
  || t.first == "_<EOT>"
2336
+ || t.first == "<|end_of_text|>"
2337
+ || t.first == "<end_of_utterance>" // smoldocling
1990
2338
  ) {
1991
2339
  special_eog_ids.insert(t.second);
1992
2340
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2003,6 +2351,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2003
2351
  }
2004
2352
  }
2005
2353
 
2354
+ // @ngxson : quick hack for gpt-oss, always render these tokens
2355
+ for (const auto & t : token_to_id) {
2356
+ if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
2357
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2358
+ }
2359
+ }
2360
+
2006
2361
  // sanity checks
2007
2362
  if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
2008
2363
  special_eog_ids.insert(special_eos_id);
@@ -2018,6 +2373,37 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2018
2373
  special_eog_ids.insert(special_eom_id);
2019
2374
  LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2020
2375
  }
2376
+
2377
+ // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
2378
+ // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
2379
+ // we remove the "<|end|>" token from the EOG list
2380
+ {
2381
+ bool has_return = false;
2382
+ bool has_call = false;
2383
+ bool has_end = false;
2384
+
2385
+ llama_token end_id = LLAMA_TOKEN_NULL;
2386
+
2387
+ LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
2388
+ for (auto tid : special_eog_ids) {
2389
+ LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
2390
+
2391
+ if (id_to_token[tid].text == "<|return|>") {
2392
+ has_return = true;
2393
+ } else if (id_to_token[tid].text == "<|call|>") {
2394
+ has_call = true;
2395
+ } else if (id_to_token[tid].text == "<|end|>") {
2396
+ has_end = true;
2397
+ end_id = tid;
2398
+ }
2399
+ }
2400
+
2401
+ if (has_return && has_call && has_end) {
2402
+ special_eog_ids.erase(end_id);
2403
+ id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2404
+ LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
2405
+ }
2406
+ }
2021
2407
  }
2022
2408
 
2023
2409
  // build special tokens cache
@@ -2059,9 +2445,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2059
2445
  //NOTE: Per token attributes are missing from the GGUF file.
2060
2446
  //TODO: Extract attributes from GGUF file.
2061
2447
  {
2062
- auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
2448
+ auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
2063
2449
  for (const auto & substr : substrs) {
2064
- if (str.find(substr) < std::string::npos) {
2450
+ if (str.find(substr) != std::string::npos) {
2065
2451
  return true;
2066
2452
  }
2067
2453
  }
@@ -2080,9 +2466,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2080
2466
 
2081
2467
  std::string model_name;
2082
2468
  std::string tokenizer_pre;
2469
+ std::string general_arch;
2083
2470
 
2084
2471
  ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
2085
2472
  ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
2473
+ ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
2086
2474
 
2087
2475
  // model name to lowercase
2088
2476
  std::transform(model_name.begin(), model_name.end(), model_name.begin(),
@@ -2091,9 +2479,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2091
2479
  }
2092
2480
  );
2093
2481
 
2094
- // set attributes by model/tokenizer name
2095
- if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
2096
- _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2482
+ // set attributes by model/tokenizer/architecture name
2483
+ if (false
2484
+ || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2485
+ || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
2486
+ ) {
2487
+ if (token_to_id.count("<mask>") == 0) {
2488
+ LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
2489
+ } else {
2490
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2491
+ }
2097
2492
  } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
2098
2493
  for (auto id : cache_special_tokens) {
2099
2494
  _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
@@ -2114,13 +2509,14 @@ enum llama_vocab_type llama_vocab::impl::get_type() const {
2114
2509
 
2115
2510
  std::string llama_vocab::impl::type_name() const{
2116
2511
  switch (type) {
2117
- case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
2118
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2119
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2120
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2121
- case LLAMA_VOCAB_TYPE_UGM: return "UGM";
2122
- case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
2123
- default: return "unknown";
2512
+ case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
2513
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2514
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2515
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2516
+ case LLAMA_VOCAB_TYPE_UGM: return "UGM";
2517
+ case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
2518
+ case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
2519
+ default: return "unknown";
2124
2520
  }
2125
2521
  }
2126
2522
 
@@ -2203,6 +2599,9 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
2203
2599
  case LLAMA_VOCAB_TYPE_RWKV:
2204
2600
  tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
2205
2601
  break;
2602
+ case LLAMA_VOCAB_TYPE_PLAMO2:
2603
+ tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
2604
+ break;
2206
2605
  default:
2207
2606
  GGML_ABORT("unsupported vocab type");
2208
2607
  }
@@ -2535,6 +2934,23 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
2535
2934
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2536
2935
  std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
2537
2936
 
2937
+ #ifdef PRETOKENIZERDEBUG
2938
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2939
+ #endif
2940
+
2941
+ session.tokenize(text, output);
2942
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2943
+ output.push_back(fragment.token);
2944
+ }
2945
+ }
2946
+ } break;
2947
+ case LLAMA_VOCAB_TYPE_PLAMO2:
2948
+ {
2949
+ llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
2950
+ for (const auto & fragment : fragment_buffer) {
2951
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2952
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
2953
+
2538
2954
  #ifdef PRETOKENIZERDEBUG
2539
2955
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2540
2956
  #endif
@@ -2563,6 +2979,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
2563
2979
  // copy piece chars to output text buffer
2564
2980
  // skip up to 'lstrip' leading spaces before copying
2565
2981
  auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
2982
+ if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
2983
+ GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
2984
+ }
2985
+
2566
2986
  for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
2567
2987
  token++;
2568
2988
  size--;
@@ -2629,6 +3049,24 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
2629
3049
  memcpy(buf, result.data(), result.size());
2630
3050
  return (int)result.size();
2631
3051
  }
3052
+ case LLAMA_VOCAB_TYPE_PLAMO2: {
3053
+ // PLaMo-2 uses similar token handling as BPE/SPM
3054
+ if (vocab.is_byte(token)) {
3055
+ // Handle byte tokens like <0xXX>
3056
+ if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
3057
+ int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
3058
+ if (length < 1) {
3059
+ return -1;
3060
+ }
3061
+ buf[0] = static_cast<char>(hex_val);
3062
+ return 1;
3063
+ }
3064
+ }
3065
+
3066
+ // Normal token - just copy the text
3067
+ std::string result = token_text;
3068
+ return _try_copy(result.data(), result.size());
3069
+ }
2632
3070
  default:
2633
3071
  GGML_ABORT("fatal error");
2634
3072
  }
@@ -2759,26 +3197,26 @@ void llama_vocab::impl::print_info() const {
2759
3197
  LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
2760
3198
 
2761
3199
  // special tokens
2762
- if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); }
2763
- if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); }
2764
- if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); }
2765
- if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); }
2766
- if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); }
2767
- if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); }
2768
- if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); }
2769
- if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); }
2770
-
2771
- if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); }
2772
-
2773
- if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
2774
- if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
2775
- if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
2776
- if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
2777
- if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
2778
- if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }
3200
+ if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
3201
+ if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
3202
+ if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
3203
+ if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
3204
+ if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
3205
+ if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
3206
+ if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
3207
+ if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
3208
+
3209
+ if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
3210
+
3211
+ if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
3212
+ if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
3213
+ if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
3214
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
3215
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
3216
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
2779
3217
 
2780
3218
  for (const auto & id : special_eog_ids) {
2781
- LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
3219
+ LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
2782
3220
  }
2783
3221
 
2784
3222
  LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
@@ -2873,6 +3311,12 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
2873
3311
  case LLAMA_VOCAB_TYPE_BPE: {
2874
3312
  return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
2875
3313
  }
3314
+ case LLAMA_VOCAB_TYPE_PLAMO2: {
3315
+ // PLaMo-2 uses byte tokens in format <0xXX>
3316
+ char hex_str[8];
3317
+ snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
3318
+ return pimpl->token_to_id.at(hex_str);
3319
+ }
2876
3320
  default:
2877
3321
  GGML_ABORT("fatal error");
2878
3322
  }
@@ -2974,6 +3418,10 @@ llama_token llama_vocab::token_fim_sep() const {
2974
3418
  return pimpl->special_fim_sep_id;
2975
3419
  }
2976
3420
 
3421
+ llama_token llama_vocab::token_mask() const {
3422
+ return pimpl->special_mask_id;
3423
+ }
3424
+
2977
3425
  bool llama_vocab::get_add_space_prefix() const {
2978
3426
  return pimpl->add_space_prefix;
2979
3427
  }
@@ -2986,6 +3434,10 @@ bool llama_vocab::get_add_eos() const {
2986
3434
  return pimpl->add_eos;
2987
3435
  }
2988
3436
 
3437
+ bool llama_vocab::get_add_sep() const {
3438
+ return pimpl->add_sep;
3439
+ }
3440
+
2989
3441
  bool llama_vocab::get_ignore_merges() const {
2990
3442
  return pimpl->ignore_merges;
2991
3443
  }
@@ -3046,6 +3498,11 @@ int32_t llama_vocab::tokenize(
3046
3498
  bool add_special,
3047
3499
  bool parse_special) const {
3048
3500
  auto res = tokenize(std::string(text, text_len), add_special, parse_special);
3501
+ if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
3502
+ LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
3503
+ return std::numeric_limits<int32_t>::min();
3504
+ }
3505
+
3049
3506
  if (n_tokens_max < (int) res.size()) {
3050
3507
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
3051
3508
  return -((int) res.size());
@@ -3177,6 +3634,10 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
3177
3634
  return vocab->get_add_eos();
3178
3635
  }
3179
3636
 
3637
+ bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
3638
+ return vocab->get_add_sep();
3639
+ }
3640
+
3180
3641
  llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
3181
3642
  return vocab->token_fim_pre();
3182
3643
  }
@@ -3201,6 +3662,10 @@ llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
3201
3662
  return vocab->token_fim_sep();
3202
3663
  }
3203
3664
 
3665
+ llama_token llama_vocab_mask(const struct llama_vocab* vocab) {
3666
+ return vocab->token_mask();
3667
+ }
3668
+
3204
3669
  // deprecated
3205
3670
  const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
3206
3671
  return llama_vocab_get_text(vocab, token);
@@ -3337,4 +3802,3 @@ int32_t llama_detokenize(
3337
3802
  bool unparse_special) {
3338
3803
  return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
3339
3804
  }
3340
-