whispercpp 1.3.3 → 1.3.4

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (586)
  1. checksums.yaml +4 -4
  2. data/ext/ruby_whisper_params.c +55 -25
  3. data/ext/sources/CMakeLists.txt +1 -1
  4. data/ext/sources/bindings/javascript/package.json +1 -1
  5. data/ext/sources/build-xcframework.sh +24 -0
  6. data/ext/sources/examples/CMakeLists.txt +1 -0
  7. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  8. data/ext/sources/examples/addon.node/index.js +7 -5
  9. data/ext/sources/examples/bench/bench.cpp +26 -16
  10. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  11. data/ext/sources/examples/cli/cli.cpp +4 -2
  12. data/ext/sources/examples/command/command.cpp +26 -24
  13. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  14. data/ext/sources/examples/common-ggml.cpp +2 -0
  15. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  16. data/ext/sources/examples/server/server.cpp +24 -13
  17. data/ext/sources/examples/server.py +6 -1
  18. data/ext/sources/examples/stream/stream.cpp +4 -2
  19. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  20. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  21. data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
  22. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  23. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  24. data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
  25. data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
  26. data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
  27. data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
  28. data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
  29. data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
  30. data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
  31. data/ext/sources/examples/talk-llama/llama-context.h +44 -29
  32. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  33. data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
  34. data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
  35. data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
  36. data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
  37. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  38. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  39. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  40. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
  41. data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
  42. data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
  43. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
  44. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  45. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
  46. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
  47. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  48. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
  49. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  50. data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
  51. data/ext/sources/examples/talk-llama/llama-model.h +60 -9
  52. data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
  53. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  54. data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
  55. data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
  56. data/ext/sources/examples/talk-llama/llama.cpp +65 -10
  57. data/ext/sources/examples/talk-llama/llama.h +95 -177
  58. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  59. data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
  60. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  61. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  62. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  63. data/ext/sources/ggml/CMakeLists.txt +59 -31
  64. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  65. data/ext/sources/ggml/include/ggml-backend.h +17 -1
  66. data/ext/sources/ggml/include/ggml-cpu.h +1 -1
  67. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  68. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  69. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  70. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  71. data/ext/sources/ggml/include/ggml.h +221 -16
  72. data/ext/sources/ggml/src/CMakeLists.txt +17 -2
  73. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  74. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
  76. data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  79. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  83. data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
  85. data/ext/sources/ggml/src/ggml-common.h +17 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  90. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
  91. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  92. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
  93. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  94. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  95. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  96. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  97. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  98. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
  99. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  100. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
  101. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
  103. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  104. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  105. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
  106. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
  107. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
  108. data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
  109. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  110. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
  112. data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
  113. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
  114. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  115. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  116. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  117. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  118. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  119. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  120. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
  121. data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
  122. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  123. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  124. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  125. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  126. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  127. data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
  128. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  129. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  130. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  131. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  132. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
  133. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  134. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  135. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  136. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  137. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
  138. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
  139. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  140. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  141. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  142. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
  143. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  144. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  145. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  146. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
  147. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  148. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  149. data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
  150. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  151. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  152. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  153. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  154. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  155. data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  156. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  157. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  158. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  159. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  160. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  161. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  162. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  163. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  164. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  165. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  166. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  167. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  168. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  169. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  170. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  171. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  172. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  173. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  174. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  176. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  177. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
  178. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  179. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  191. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  192. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  193. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  234. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  235. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  236. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  237. data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
  238. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
  239. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  240. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  241. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  242. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  243. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  244. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  245. data/ext/sources/ggml/src/ggml-impl.h +119 -9
  246. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  247. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  248. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  249. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  250. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  251. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  252. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  253. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  254. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
  255. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
  259. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  260. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
  261. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
  262. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  263. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  264. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  265. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  266. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  300. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  301. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  302. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  303. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
  304. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  305. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
  306. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  307. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
  308. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  309. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
  310. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
  311. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  312. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  313. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
  314. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
  315. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  316. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  317. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  318. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
  319. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  320. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  321. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  322. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  323. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
  324. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  325. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  326. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
  327. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  328. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  329. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  330. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  331. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  332. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  333. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
  334. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  335. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  336. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  337. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  338. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  339. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  340. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  341. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  342. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  343. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  344. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  345. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  346. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  347. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  348. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  349. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  350. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  351. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  352. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  353. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  354. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  355. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  356. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  357. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  358. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  359. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  360. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  361. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  362. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  363. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  364. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  365. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
  366. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  367. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  368. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  369. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  370. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  371. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  372. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  373. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  374. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  375. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
  401. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  402. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  403. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  404. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  405. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  406. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  407. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  408. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  409. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  410. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  411. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  412. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  413. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  414. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  415. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  416. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  417. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  418. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  419. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  420. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  421. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  422. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  423. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  424. data/ext/sources/ggml/src/ggml.c +478 -98
  425. data/ext/sources/ggml/src/gguf.cpp +8 -1
  426. data/ext/sources/src/whisper.cpp +23 -46
  427. data/ext/sources/tests/CMakeLists.txt +8 -1
  428. data/ext/sources/tests/test-vad-full.cpp +3 -3
  429. data/ext/sources/tests/test-vad.cpp +2 -2
  430. data/lib/whisper/model/uri.rb +1 -1
  431. data/sig/whisper.rbs +7 -0
  432. data/test/test_params.rb +8 -0
  433. data/test/test_whisper.rb +1 -1
  434. data/whispercpp.gemspec +1 -1
  435. metadata +164 -157
  436. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  437. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  438. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  439. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  440. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  441. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  442. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  443. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  444. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  445. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  446. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  447. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  448. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  449. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  450. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  451. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  452. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  453. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  454. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  455. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  456. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  457. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  458. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  459. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  460. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  461. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  462. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  463. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  464. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  465. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  466. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  467. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  468. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  469. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  470. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  471. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  472. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  473. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  474. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  475. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  476. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  477. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  478. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  479. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  480. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  481. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  482. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  483. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  484. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  485. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  486. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  487. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  488. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  489. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  490. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  491. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  492. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  493. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  494. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  495. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  496. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  497. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  498. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  499. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  500. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  501. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  502. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  503. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  504. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  505. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  506. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  507. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  508. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  509. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  510. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  511. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  512. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  513. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  514. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  515. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  516. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  517. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  518. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  519. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  520. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  521. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  522. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  523. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  524. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  525. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  526. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  527. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  548. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  549. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  550. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  551. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  552. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  553. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  554. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  555. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  556. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  557. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  558. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  559. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  560. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  561. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  562. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  563. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  564. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  565. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  566. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  567. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  568. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  569. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  570. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  571. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  572. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  573. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  574. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  575. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  576. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  577. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  578. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  579. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  580. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  581. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  582. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  583. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  584. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  585. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  586. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
@@ -11,6 +11,7 @@
11
11
  #include <cassert>
12
12
  #include <cctype>
13
13
  #include <cfloat>
14
+ #include <cmath>
14
15
  #include <cstdarg>
15
16
  #include <cstring>
16
17
  #include <forward_list>
@@ -306,6 +307,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
306
307
  };
307
308
  break;
308
309
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
310
+ case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
309
311
  regex_exprs = {
310
312
  "\\p{N}{1,3}",
311
313
  "[一-龥぀-ゟ゠-ヿ]+",
@@ -351,6 +353,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
351
353
  break;
352
354
  case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
353
355
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
356
+ case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
354
357
  regex_exprs = {
355
358
  // original regex from tokenizer.json
356
359
  // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -403,6 +406,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
403
406
  "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
404
407
  };
405
408
  break;
409
+ case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
410
+ regex_exprs = {
411
+ // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
412
+ // The custom handler implements all K2 patterns with proper Han character exclusion
413
+ "\\p{Han}+",
414
+ };
415
+ break;
406
416
  case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
407
417
  regex_exprs = {
408
418
  "\\p{N}+",
@@ -424,6 +434,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
424
434
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
425
435
  };
426
436
  break;
437
+ case LLAMA_VOCAB_PRE_TYPE_GROK_2:
438
+ regex_exprs = {
439
+ // original regex from tokenizer.json
440
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
441
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
442
+ };
443
+ break;
427
444
  default:
428
445
  // default regex for BPE tokenization pre-processing
429
446
  regex_exprs = {
@@ -1195,6 +1212,284 @@ private:
1195
1212
  const llm_tokenizer_rwkv & tokenizer;
1196
1213
  };
1197
1214
 
1215
+ struct llm_tokenizer_plamo2 : llm_tokenizer {
1216
+ llm_tokenizer_plamo2(const llama_vocab & vocab) {
1217
+ build(vocab);
1218
+ }
1219
+
1220
+ void build(const llama_vocab & vocab) {
1221
+ // Reset internal structures
1222
+ tokens_.clear();
1223
+ bytes_.assign(256, 0);
1224
+ to_suffix_id_.clear();
1225
+ table_.clear();
1226
+
1227
+ // Build token list and byte mapping
1228
+ std::unordered_map<std::string, float> suffix_to_score;
1229
+ std::unordered_map<std::string, llama_token> token_to_id;
1230
+
1231
+ for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
1232
+ const auto & entry = vocab.get_token_data(token_id);
1233
+ tokens_.push_back(entry.text);
1234
+ token_to_id[entry.text] = static_cast<llama_token>(token_id);
1235
+
1236
+ // Handle byte tokens
1237
+ if (vocab.is_byte(token_id)) {
1238
+ if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
1239
+ std::string hex_str = entry.text.substr(3, 2);
1240
+ int byte_val = std::stoi(hex_str, nullptr, 16);
1241
+ bytes_[byte_val] = static_cast<llama_token>(token_id);
1242
+ }
1243
+ continue;
1244
+ }
1245
+
1246
+ // Add token and all its suffixes to suffix_to_score
1247
+ suffix_to_score[entry.text] = entry.score;
1248
+
1249
+ // Extract suffixes character by character (UTF-8 aware)
1250
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
1251
+ for (size_t i = 1; i < cpts.size(); ++i) {
1252
+ std::string suffix;
1253
+ for (size_t j = i; j < cpts.size(); ++j) {
1254
+ suffix += unicode_cpt_to_utf8(cpts[j]);
1255
+ }
1256
+ if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
1257
+ suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
1258
+ }
1259
+ }
1260
+ }
1261
+
1262
+ // Check that all byte tokens are set
1263
+ for (int i = 0; i < 256; ++i) {
1264
+ if (bytes_[i] == 0) {
1265
+ throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
1266
+ }
1267
+ }
1268
+
1269
+ // Build suffix list in lexicographical order of reversed strings
1270
+ std::vector<std::string> suffixes;
1271
+ for (const auto & pair : suffix_to_score) {
1272
+ suffixes.push_back(pair.first);
1273
+ }
1274
+ suffixes.push_back(""); // Empty suffix
1275
+
1276
+ std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
1277
+ std::string rev_a(a.rbegin(), a.rend());
1278
+ std::string rev_b(b.rbegin(), b.rend());
1279
+ return rev_a < rev_b;
1280
+ });
1281
+
1282
+ // Build suffix_to_id and to_suffix_id_
1283
+ std::unordered_map<std::string, int32_t> suffix_to_id;
1284
+ int32_t num_pieces = 0;
1285
+
1286
+ for (const auto & suffix : suffixes) {
1287
+ suffix_to_id[suffix] = num_pieces;
1288
+ if (!suffix.empty()) {
1289
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
1290
+
1291
+ std::string remaining;
1292
+ for (size_t i = 1; i < cpts.size(); ++i) {
1293
+ remaining += unicode_cpt_to_utf8(cpts[i]);
1294
+ }
1295
+
1296
+ int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
1297
+ to_suffix_id_[piece_code] = num_pieces;
1298
+
1299
+ // Count number of pieces for this suffix
1300
+ int32_t pieces_for_suffix = 1; // sentinel row
1301
+ for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
1302
+ std::string piece;
1303
+ for (int32_t i = 0; i < piece_length; ++i) {
1304
+ piece += unicode_cpt_to_utf8(cpts[i]);
1305
+ }
1306
+ if (suffix_to_score.find(piece) != suffix_to_score.end()) {
1307
+ pieces_for_suffix++;
1308
+ }
1309
+ }
1310
+ num_pieces += pieces_for_suffix;
1311
+ } else {
1312
+ num_pieces++; // Empty suffix contributes one piece (sentinel row)
1313
+ }
1314
+ }
1315
+
1316
+ // Build flattened table
1317
+ table_.resize(num_pieces, std::vector<int32_t>(4, 0));
1318
+ int32_t table_idx = 0;
1319
+
1320
+ for (const auto & suffix : suffixes) {
1321
+ // Add all prefixes of the suffix to the table (in decreasing order of length)
1322
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
1323
+ for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
1324
+ std::string piece;
1325
+ for (int32_t i = 0; i < piece_length; ++i) {
1326
+ piece += unicode_cpt_to_utf8(cpts[i]);
1327
+ }
1328
+
1329
+ auto score_it = suffix_to_score.find(piece);
1330
+ if (score_it == suffix_to_score.end()) {
1331
+ continue;
1332
+ }
1333
+
1334
+ table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
1335
+ auto token_it = token_to_id.find(piece);
1336
+ table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;
1337
+
1338
+ float score = score_it->second;
1339
+ table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
1340
+ static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
1341
+ table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];
1342
+
1343
+ table_idx++;
1344
+ }
1345
+
1346
+ // Add sentinel row
1347
+ table_[table_idx][TABLE_PIECE_LENGTH] = 1;
1348
+ table_[table_idx][TABLE_TOKEN_ID] = -1;
1349
+ table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
1350
+ table_idx++;
1351
+ }
1352
+ }
1353
+
1354
+ std::vector<llama_token> encode(const std::string & text) const {
1355
+ std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
1356
+ // Skip the first code point if it is a BOM (Byte Order Mark)
1357
+ if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
1358
+ unicode_data.erase(unicode_data.begin());
1359
+ }
1360
+
1361
+ if (unicode_data.empty()) {
1362
+ return {};
1363
+ }
1364
+
1365
+ const size_t data_len = unicode_data.size();
1366
+
1367
+ // Initialize scores array (dynamic programming)
1368
+ std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
1369
+ scores[data_len] = 0;
1370
+
1371
+ // Path array to track best tokenization
1372
+ std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));
1373
+
1374
+ int32_t suffix_id = 0;
1375
+
1376
+ // Process from end to beginning
1377
+ for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
1378
+ uint32_t c = unicode_data[i];
1379
+
1380
+ // Find next suffix ID
1381
+ for (size_t p = suffix_id; p < table_.size(); ++p) {
1382
+ int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
1383
+ auto it = to_suffix_id_.find(piece_code);
1384
+ suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;
1385
+
1386
+ if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
1387
+ break;
1388
+ }
1389
+ }
1390
+
1391
+ // Update best path
1392
+ for (size_t p = suffix_id; p < table_.size(); ++p) {
1393
+ int32_t score = table_[p][TABLE_SCORE];
1394
+ if (score > INVALID_SCORE) {
1395
+ int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
1396
+ int64_t s = scores[i + piece_length] - score;
1397
+
1398
+ if (s < scores[i]) {
1399
+ scores[i] = s;
1400
+ path[i][PATH_TOKEN_LENGTH] = piece_length;
1401
+ path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
1402
+ path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;
1403
+
1404
+ if (score == UNKNOWN_SCORE) {
1405
+ // Add UTF-8 byte count
1406
+ path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
1407
+ }
1408
+ }
1409
+ }
1410
+
1411
+ if (score == UNKNOWN_SCORE) {
1412
+ break;
1413
+ }
1414
+ }
1415
+ }
1416
+
+         // Decode the best path
+         std::vector<llama_token> token_ids;
+         token_ids.reserve(path[0][PATH_NUM_TOKENS]);
+
+         int pos = 0;
+         while (pos < static_cast<int>(data_len)) {
+             if (path[pos][PATH_TOKEN_ID] >= 0) {
+                 token_ids.push_back(path[pos][PATH_TOKEN_ID]);
+             } else {
+                 // Fall back to byte tokens
+                 uint32_t c = unicode_data[pos];
+                 int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
+
+                 for (int i = 0; i < s; ++i) {
+                     uint8_t b;
+                     if (s == 1) {
+                         b = c;
+                     } else {
+                         if (i == 0) {
+                             b = (0xF00 >> s) & 0xFF;
+                         } else {
+                             b = 0x80;
+                         }
+                     }
+                     token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
+                 }
+             }
+
+             assert(path[pos][PATH_TOKEN_LENGTH] > 0);
+             pos += path[pos][PATH_TOKEN_LENGTH];
+         }
+
+         return token_ids;
+     }
+ private:
+     // Constants for table structure
+     static constexpr int32_t TABLE_PIECE_LENGTH = 0;
+     static constexpr int32_t TABLE_TOKEN_ID = 1;
+     static constexpr int32_t TABLE_SCORE = 2;
+     static constexpr int32_t TABLE_PIECE_ID = 3;
+
+     // Constants for path array
+     static constexpr int32_t PATH_TOKEN_LENGTH = 0;
+     static constexpr int32_t PATH_TOKEN_ID = 1;
+     static constexpr int32_t PATH_NUM_TOKENS = 2;
+
+     // Score constants
+     static constexpr int32_t INVALID_SCORE = -20000000;
+     static constexpr int32_t UNKNOWN_SCORE = -10000000;
+
+     // List of tokens in the vocabulary
+     std::vector<std::string> tokens_;
+
+     // Mapping from byte code point to token ID (for byte fallback)
+     std::vector<llama_token> bytes_;
+
+     // Mapping from piece code to suffix ID
+     std::unordered_map<int64_t, int32_t> to_suffix_id_;
+
+     // Flattened table representing the Trie structure
+     // Each row contains: [piece_length, token_id, score, piece_id]
+     std::vector<std::vector<int32_t>> table_;
+ };
+
+ struct llm_tokenizer_plamo2_session {
+     llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
+
+     void tokenize(const std::string & text, std::vector<llama_token> & output) {
+         std::vector<llama_token> tokens = tokenizer.encode(text);
+         output.insert(output.end(), tokens.begin(), tokens.end());
+     }
+
+ private:
+     const llm_tokenizer_plamo2 & tokenizer;
+ };
+
  //
  // impl
  //
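The `encode()` method added above is a Viterbi-style dynamic program: working right to left, `scores[i]` holds the best achievable cost for the suffix of the text starting at code point `i`, `path[i]` records the piece length and token id that achieved it, and code points with no scored piece fall back to UTF-8 byte tokens. The sketch below illustrates the same segmentation idea on a toy piece-to-score map; it works on raw bytes instead of Unicode code points and uses a plain lookup instead of the flattened suffix-trie table, so the vocabulary, the `segment_dp` name, and the penalty value are illustrative only, not part of the library.

#include <algorithm>
#include <cstdio>
#include <limits>
#include <string>
#include <unordered_map>
#include <vector>

// Toy right-to-left DP segmentation: pick the split that maximizes the total
// piece score (i.e. minimizes the negated score), falling back to single
// characters with a fixed penalty when no vocabulary piece matches.
static std::vector<std::string> segment_dp(const std::string & text,
                                           const std::unordered_map<std::string, double> & piece_score) {
    const size_t n = text.size();
    const double INF = std::numeric_limits<double>::infinity();
    const double UNKNOWN_PENALTY = 10.0;          // cost of emitting a raw character

    std::vector<double> cost(n + 1, INF);         // cost[i] = best cost of text[i..)
    std::vector<size_t> best_len(n + 1, 1);       // length of the piece chosen at i
    cost[n] = 0.0;

    size_t max_piece = 1;
    for (const auto & kv : piece_score) {
        max_piece = std::max(max_piece, kv.first.size());
    }

    for (size_t i = n; i-- > 0; ) {               // process from end to beginning
        cost[i] = cost[i + 1] + UNKNOWN_PENALTY;  // fallback: single character
        best_len[i] = 1;
        for (size_t len = 1; len <= max_piece && i + len <= n; ++len) {
            auto it = piece_score.find(text.substr(i, len));
            if (it == piece_score.end()) {
                continue;
            }
            double c = cost[i + len] - it->second; // higher score -> lower cost
            if (c < cost[i]) {
                cost[i] = c;
                best_len[i] = len;
            }
        }
    }

    std::vector<std::string> pieces;
    for (size_t pos = 0; pos < n; pos += best_len[pos]) {
        pieces.push_back(text.substr(pos, best_len[pos]));
    }
    return pieces;
}

int main() {
    std::unordered_map<std::string, double> vocab = {
        {"to", 2.0}, {"ken", 2.0}, {"token", 5.0}, {"ize", 3.0}, {"r", 0.5},
    };
    for (const auto & p : segment_dp("tokenizer", vocab)) {
        std::printf("'%s' ", p.c_str());
    }
    std::printf("\n");                            // prints: 'token' 'ize' 'r'
}

For the byte fallback in the real `encode()`, `(c >= 0x80) + (c >= 0x800) + (c >= 0x10000)` is the number of UTF-8 continuation bytes for code point `c`, and `(0xF00 >> s) & 0xFF` yields the lead-byte prefix (0xC0, 0xE0 or 0xF0) for an `s`-byte sequence.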
@@ -1477,7 +1772,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
          const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
          const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
          precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
- #ifdef IS_BIG_ENDIAN
+ #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
          // correct endiannes of data in precompiled_charsmap binary blob
          uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
          *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
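The hunk above replaces the `#ifdef` on the ad-hoc `IS_BIG_ENDIAN` macro with the compiler-provided `__BYTE_ORDER__` / `__ORDER_BIG_ENDIAN__` macros, so the byte swap of the precompiled charsmap no longer depends on that macro being defined somewhere in the build. A minimal sketch of the same compile-time check; the `le32_to_host` helper and test value are illustrative, and `__builtin_bswap32` is the GCC/Clang intrinsic already used above:

#include <cstdint>
#include <cstdio>

// Convert a 32-bit value stored little-endian on disk to host byte order.
// On big-endian hosts the predefined macros select the byte swap at compile time.
static uint32_t le32_to_host(uint32_t v) {
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    return __builtin_bswap32(v);
#else
    return v;
#endif
}

int main() {
    std::printf("0x%08X\n", le32_to_host(0x11223344u));
}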
@@ -1498,6 +1793,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
          special_unk_id = LLAMA_TOKEN_NULL;
          special_sep_id = LLAMA_TOKEN_NULL;
          special_pad_id = LLAMA_TOKEN_NULL;
+     } else if (tokenizer_model == "plamo2") {
+         type = LLAMA_VOCAB_TYPE_PLAMO2;
+
+         // PLaMo-2 default special tokens (these will be overridden by model config)
+         special_bos_id = 1; // <|plamo:bos|>
+         special_eos_id = 2; // <|plamo:eos|>
+         special_unk_id = 0; // <|plamo:unk|>
+         special_sep_id = LLAMA_TOKEN_NULL;
+         special_pad_id = 3; // <|plamo:pad|>
+         special_mask_id = LLAMA_TOKEN_NULL;
      } else {
          throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
      }
@@ -1522,7 +1827,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                  tokenizer_pre == "llama-v3" ||
                  tokenizer_pre == "llama-bpe"||
                  tokenizer_pre == "falcon3" ||
-                 tokenizer_pre == "pixtral") {
+                 tokenizer_pre == "falcon-h1" ||
+                 tokenizer_pre == "pixtral" ||
+                 tokenizer_pre == "midm-2.0" ||
+                 tokenizer_pre == "lfm2") {
              pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
              ignore_merges = true;
              add_bos = true;
@@ -1554,7 +1862,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                  tokenizer_pre == "jina-de" ||
                  tokenizer_pre == "gigachat" ||
                  tokenizer_pre == "jina-v2-es" ||
-                 tokenizer_pre == "jina-v2-de") {
+                 tokenizer_pre == "jina-v2-de" ||
+                 tokenizer_pre == "a.x-4.0" ||
+                 tokenizer_pre == "mellum") {
              pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
          } else if (
                  tokenizer_pre == "jina-v1-en" ||
@@ -1624,6 +1934,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
          } else if (
                  tokenizer_pre == "exaone") {
              pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+         } else if (
+                 tokenizer_pre == "exaone4") {
+             pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
          } else if (
                  tokenizer_pre == "chameleon") {
              pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
@@ -1649,13 +1962,30 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
              clean_spaces = false;
          } else if (
-                 tokenizer_pre == "bailingmoe") {
+                 tokenizer_pre == "bailingmoe" ||
+                 tokenizer_pre == "llada-moe") {
              pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
              clean_spaces = false;
          } else if (
                  tokenizer_pre == "seed-coder") {
              pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
              clean_spaces = false;
+         } else if (
+                 tokenizer_pre == "hunyuan") {
+             pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
+             clean_spaces = false;
+         } else if (
+                 tokenizer_pre == "hunyuan-dense") {
+             pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
+             clean_spaces = false;
+         } else if (
+                 tokenizer_pre == "kimi-k2") {
+             pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
+             clean_spaces = false;
+         } else if (
+                 tokenizer_pre == "grok-2") {
+             pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
+             clean_spaces = false;
          } else {
              throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
          }
@@ -1839,6 +2169,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                  || t.first == "<EOT>"
                  || t.first == "_<EOT>"
                  || t.first == "<|end▁of▁sentence|>" // DeepSeek
+                 || t.first == "<end_of_utterance>" // smoldocling
              ) {
                  special_eot_id = t.second;
                  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1872,6 +2203,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                  || t.first == "<|fim▁begin|>" // DeepSeek
                  || t.first == "<PRE>"
                  || t.first == "▁<PRE>" // CodeLlama
+                 || t.first == "<|code_prefix|>" // GLM-4.5
              ) {
                  special_fim_pre_id = t.second;
                  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1891,6 +2223,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                  || t.first == "<|fim▁hole|>" // DeepSeek
                  || t.first == "<SUF>"
                  || t.first == "▁<SUF>" // CodeLlama
+                 || t.first == "<|code_suffix|>" // GLM-4.5
              ) {
                  special_fim_suf_id = t.second;
                  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1910,6 +2243,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                  || t.first == "<|fim▁end|>" // DeepSeek
                  || t.first == "<MID>"
                  || t.first == "▁<MID>" // CodeLlama
+                 || t.first == "<|code_middle|>" // GLM-4.5
              ) {
                  special_fim_mid_id = t.second;
                  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1992,12 +2326,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                  || t.first == "<|eot_id|>"
                  || t.first == "<|im_end|>"
                  || t.first == "<|end|>"
+                 || t.first == "<|return|>" // o200k_harmony
+                 || t.first == "<|call|>" // o200k_harmony
                  || t.first == "<end_of_turn>"
                  || t.first == "<|endoftext|>"
                  || t.first == "<|eom_id|>"
                  || t.first == "<EOT>"
                  || t.first == "_<EOT>"
                  || t.first == "<|end_of_text|>"
+                 || t.first == "<end_of_utterance>" // smoldocling
              ) {
                  special_eog_ids.insert(t.second);
                  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2014,6 +2351,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              }
          }

+         // @ngxson : quick hack for gpt-oss, always render these tokens
+         for (const auto & t : token_to_id) {
+             if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
+                 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+             }
+         }
+
          // sanity checks
          if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
              special_eog_ids.insert(special_eos_id);
@@ -2029,6 +2373,37 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
              special_eog_ids.insert(special_eom_id);
              LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
          }
+
+         // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+         // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+         // we remove the "<|end|>" token from the EOG list
+         {
+             bool has_return = false;
+             bool has_call = false;
+             bool has_end = false;
+
+             llama_token end_id = LLAMA_TOKEN_NULL;
+
+             LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
+             for (auto tid : special_eog_ids) {
+                 LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+
+                 if (id_to_token[tid].text == "<|return|>") {
+                     has_return = true;
+                 } else if (id_to_token[tid].text == "<|call|>") {
+                     has_call = true;
+                 } else if (id_to_token[tid].text == "<|end|>") {
+                     has_end = true;
+                     end_id = tid;
+                 }
+             }
+
+             if (has_return && has_call && has_end) {
+                 special_eog_ids.erase(end_id);
+                 id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+                 LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+             }
+         }
      }

      // build special tokens cache
@@ -2107,7 +2482,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
      // set attributes by model/tokenizer/architecture name
      if (false
          || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
-         || _contains_any(general_arch, {"nomic-bert-moe"})
+         || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
          ) {
          if (token_to_id.count("<mask>") == 0) {
              LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
@@ -2134,13 +2509,14 @@ enum llama_vocab_type llama_vocab::impl::get_type() const {

  std::string llama_vocab::impl::type_name() const{
      switch (type) {
-         case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
-         case LLAMA_VOCAB_TYPE_SPM: return "SPM";
-         case LLAMA_VOCAB_TYPE_BPE: return "BPE";
-         case LLAMA_VOCAB_TYPE_WPM: return "WPM";
-         case LLAMA_VOCAB_TYPE_UGM: return "UGM";
-         case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
-         default: return "unknown";
+         case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+         case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+         case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+         case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+         case LLAMA_VOCAB_TYPE_UGM: return "UGM";
+         case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
+         case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
+         default: return "unknown";
      }
  }

@@ -2223,6 +2599,9 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
          case LLAMA_VOCAB_TYPE_RWKV:
              tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
              break;
+         case LLAMA_VOCAB_TYPE_PLAMO2:
+             tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
+             break;
          default:
              GGML_ABORT("unsupported vocab type");
      }
@@ -2555,6 +2934,23 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
                      if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                          std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

+ #ifdef PRETOKENIZERDEBUG
+                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+ #endif
+
+                         session.tokenize(text, output);
+                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                         output.push_back(fragment.token);
+                     }
+                 }
+             } break;
+         case LLAMA_VOCAB_TYPE_PLAMO2:
+             {
+                 llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
+                 for (const auto & fragment : fragment_buffer) {
+                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                         std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
  #ifdef PRETOKENIZERDEBUG
                          LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
  #endif
@@ -2653,6 +3049,24 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                  memcpy(buf, result.data(), result.size());
                  return (int)result.size();
              }
+         case LLAMA_VOCAB_TYPE_PLAMO2: {
+             // PLaMo-2 uses similar token handling as BPE/SPM
+             if (vocab.is_byte(token)) {
+                 // Handle byte tokens like <0xXX>
+                 if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
+                     int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
+                     if (length < 1) {
+                         return -1;
+                     }
+                     buf[0] = static_cast<char>(hex_val);
+                     return 1;
+                 }
+             }
+
+             // Normal token - just copy the text
+             std::string result = token_text;
+             return _try_copy(result.data(), result.size());
+         }
          default:
              GGML_ABORT("fatal error");
      }
@@ -2897,6 +3311,12 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
          case LLAMA_VOCAB_TYPE_BPE: {
              return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
          }
+         case LLAMA_VOCAB_TYPE_PLAMO2: {
+             // PLaMo-2 uses byte tokens in format <0xXX>
+             char hex_str[8];
+             snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
+             return pimpl->token_to_id.at(hex_str);
+         }
          default:
              GGML_ABORT("fatal error");
      }
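Both PLaMo-2 cases above rely on the textual byte-token format `<0xXX>`: `byte_to_token()` formats a raw byte into that string and looks it up in `token_to_id`, while `token_to_piece()` recognizes the pattern and converts the two hex digits back into a byte. A small self-contained sketch of that round trip; the helper names are illustrative, not part of the library:

#include <cstdint>
#include <cstdio>
#include <string>

// Format a raw byte as a PLaMo-2 style byte token, e.g. 0x41 -> "<0x41>".
static std::string byte_to_piece(uint8_t ch) {
    char hex_str[8];
    std::snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
    return hex_str;
}

// Parse "<0xXX>" back into the byte it encodes; returns -1 for other pieces.
static int piece_to_byte(const std::string & piece) {
    if (piece.length() == 6 && piece.substr(0, 3) == "<0x" && piece.back() == '>') {
        return std::stoi(piece.substr(3, 2), nullptr, 16);
    }
    return -1;
}

int main() {
    const std::string piece = byte_to_piece(0xE3);
    std::printf("%s -> 0x%02X\n", piece.c_str(), piece_to_byte(piece)); // <0xE3> -> 0xE3
}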
@@ -2998,6 +3418,10 @@ llama_token llama_vocab::token_fim_sep() const {
      return pimpl->special_fim_sep_id;
  }

+ llama_token llama_vocab::token_mask() const {
+     return pimpl->special_mask_id;
+ }
+
  bool llama_vocab::get_add_space_prefix() const {
      return pimpl->add_space_prefix;
  }
@@ -3238,6 +3662,10 @@ llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
      return vocab->token_fim_sep();
  }

+ llama_token llama_vocab_mask(const struct llama_vocab* vocab) {
+     return vocab->token_mask();
+ }
+
  // deprecated
  const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
      return llama_vocab_get_text(vocab, token);
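Together, `llama_vocab::token_mask()` and the C wrapper `llama_vocab_mask()` added above expose the vocabulary's mask token the same way the other special-token getters do, returning `LLAMA_TOKEN_NULL` when none is defined. A hedged usage sketch, assuming a `llama_model *` obtained elsewhere and the existing `llama_model_get_vocab()` / `llama_vocab_get_text()` accessors:

#include "llama.h"
#include <cstdio>

// Query the mask token through the C API added above.
// `model` is assumed to have been loaded elsewhere (e.g. via llama_model_load_from_file).
static void print_mask_token(const llama_model * model) {
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const llama_token mask = llama_vocab_mask(vocab);
    if (mask == LLAMA_TOKEN_NULL) {
        std::printf("vocab has no mask token\n");
    } else {
        std::printf("mask token id: %d ('%s')\n", mask, llama_vocab_get_text(vocab, mask));
    }
}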
@@ -3374,4 +3802,3 @@ int32_t llama_detokenize(
                      bool unparse_special) {
      return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
  }
-